    In this notebook, we build feature data frames with both CountVectorizer and TfidfVectorizer. We then compare the
    50 most frequent terms of each subreddit to find those that appear in one list but not the other, and look at the
    terms most strongly correlated (positively and negatively) with the target.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the cleaned posts, then discard the raw 'text' and 'title' columns so
# only the remaining columns are carried forward.
df = pd.read_csv('../data/clean_data.csv')
df = df.drop(['text', 'title'], axis=1)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,char_count,word_count,clean_titles,target
0,64,10,the supplement rule mate free monday effect,1
1,117,21,what someth mother taught tip life hack etc li...,1
2,161,24,how consciou well groom peopl fingernail appea...,1
3,73,16,ladi love alon get use live sigoth,1
4,49,9,what rememb style wise,1


In [4]:
# Bag-of-words counts over the cleaned titles: unigrams and bigrams, English
# stop words removed, keeping only terms that occur in at least 3 titles.
cvec = CountVectorizer(stop_words='english', min_df=3, max_df=1.0, ngram_range=(1,2))
cvec_matrix = cvec.fit_transform(df['clean_titles'])

# scikit-learn renamed get_feature_names() to get_feature_names_out() and
# removed the old name in 1.2, so use whichever this installation provides.
cvec_terms = (cvec.get_feature_names_out() if hasattr(cvec, 'get_feature_names_out')
              else cvec.get_feature_names())

# pd.SparseDataFrame was removed in pandas 1.0. The matrix was already being
# densified with .toarray(), so a regular DataFrame holds the same values.
# One column per term, plus the target joined on the shared row index.
cvec_df = pd.DataFrame(cvec_matrix.toarray(), columns=cvec_terms).join(df['target'])

In [5]:
# Preview the first five rows of the count-vectorized frame.
cvec_df.head(n=5)

Unnamed: 0,abus,accept,acknowledg,acn,action,activ,actual,admir,adult,advic,...,word,work,world,worst,worth,wrong,year,year old,young,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# TF-IDF weights over the cleaned titles, using the same settings as the count
# vectorizer: unigrams and bigrams, English stop words removed, and only terms
# that occur in at least 3 titles.
tfidf = TfidfVectorizer(stop_words='english', min_df=3, max_df=1.0, ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(df['clean_titles'])

# scikit-learn renamed get_feature_names() to get_feature_names_out() and
# removed the old name in 1.2, so use whichever this installation provides.
tfidf_terms = (tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
               else tfidf.get_feature_names())

# pd.SparseDataFrame was removed in pandas 1.0. The matrix was already being
# densified with .toarray(), so a regular DataFrame holds the same values.
# One column per term, plus the target joined on the shared row index.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms).join(df['target'])

In [7]:
# Write both feature frames (target column included) to CSV, omitting the
# row index from the files.
for frame, csv_path in ((cvec_df, '../data/cvec_df.csv'),
                        (tfidf_df, '../data/tfidf_df.csv')):
    frame.to_csv(csv_path, index=False)

In [8]:
# Total occurrences of every term across all titles. A single vectorized
# column sum replaces the per-column Python loop that looked up and summed
# each feature individually.
term_totals = cvec_df.drop('target', axis=1).sum()

# term -> total count mapping.
term_dict = term_totals.to_dict()

# Same totals as a one-column frame: terms on the index, a 'counts' column.
tcdf = term_totals.to_frame('counts')

In [9]:
# Per-class total count of every term (terms as rows, one column per target
# value). Computed once and reused instead of repeating the groupby per list.
cvec_class_totals = cvec_df.groupby('target').sum().T

# The 50 highest-count terms for target == 1 and for target == 0.
top_women = list(cvec_class_totals.sort_values(1, ascending=False).head(50).index)
top_men = list(cvec_class_totals.sort_values(0, ascending=False).head(50).index)

In [10]:
# Terms from the women's top list that do not occur in the men's top list,
# kept in the women's list order.
men_lookup = set(top_men)
women_only = [term for term in top_women if term not in men_lookup]
women_only

['ladi',
 'experi',
 'thought',
 'look',
 'happen',
 'realli',
 'tip',
 'women reddit',
 'place',
 'person',
 'wear',
 'consid',
 'opinion']

In [11]:
# Terms from the men's top list that do not occur in the women's top list,
# kept in the men's list order.
men_only = list(filter(lambda term: term not in top_women, top_men))
men_only

['guy',
 'men reddit',
 'man',
 'day',
 'got',
 'ask',
 'hair',
 'long',
 'tell',
 'male',
 'help',
 'attract',
 'stop']

In [12]:
# Terms present in both top lists, in the order they appear in the men's list.
women_lookup = set(top_women)
overlap = [term for term in top_men if term in women_lookup]
overlap

['men',
 'reddit',
 'thing',
 'feel',
 'like',
 'date',
 'someon',
 'girl',
 'relationship',
 'want',
 'make',
 'think',
 'best',
 'friend',
 'sigoth',
 'way',
 'women',
 'life',
 'whi',
 'time',
 'work',
 'favorit',
 'woman',
 'peopl',
 'someth',
 'know',
 'start',
 'sex',
 'good',
 'deal',
 'live',
 'partner',
 'chang',
 'use',
 'love',
 'differ',
 'age']

In [13]:
# Ten columns most positively correlated with the target (count features).
# Each column is correlated with the target directly, rather than building the
# full feature-by-feature matrix with .corr() only to keep its 'target' column.
# The target's own self-correlation (1.0) is still included, so it heads the list.
(cvec_df.corrwith(cvec_df['target'])
        .to_frame('target')
        .sort_values('target', ascending=False)
        .head(10))

Unnamed: 0,target
target,1.0
women,0.135411
ladi,0.125862
women reddit,0.097677
bodi,0.07816
like,0.074538
thought,0.074224
experienc,0.068876
consid,0.067408
parent,0.064542


In [14]:
# Ten columns most negatively correlated with the target (count features).
# Each column is correlated with the target directly, rather than building the
# full feature-by-feature matrix with .corr() only to keep its 'target' column.
(cvec_df.corrwith(cvec_df['target'])
        .to_frame('target')
        .sort_values('target', ascending=True)
        .head(10))

Unnamed: 0,target
men,-0.241019
men reddit,-0.192544
guy,-0.177014
reddit,-0.109962
fuck,-0.086127
wife,-0.086127
want,-0.082354
pick,-0.080532
girlfriend,-0.07849
beard,-0.074529


In [15]:
# Per-class summed TF-IDF weight of every term (terms as rows, one column per
# target value). Computed once and reused instead of repeating the groupby.
tfidf_class_totals = tfidf_df.groupby('target').sum().T

# The 50 highest-weight terms for target == 1 and for target == 0.
# NOTE: this rebinds top_women / top_men, which an earlier cell computed from
# the count-vectorized frame; cells below this point see the TF-IDF lists.
top_women = list(tfidf_class_totals.sort_values(1, ascending=False).head(50).index)
top_men = list(tfidf_class_totals.sort_values(0, ascending=False).head(50).index)

In [16]:
# Terms from the current women's top list that are missing from the men's top
# list, kept in the women's list order.
excluded_terms = set(top_men)
women_only = [term for term in top_women if term not in excluded_terms]
women_only

['ladi',
 'thought',
 'experi',
 'tip',
 'happen',
 'look',
 'opinion',
 'realli',
 'person',
 'differ',
 'women reddit',
 'place',
 'friendship',
 'turn',
 'bodi',
 'age',
 'mani']

In [17]:
# Terms from the current men's top list that are missing from the women's top
# list, kept in the men's list order.
men_only = list(filter(lambda term: term not in top_women, top_men))
men_only

['guy',
 'men reddit',
 'man',
 'deal',
 'day',
 'sex',
 'live',
 'ask',
 'got',
 'hair',
 'stop',
 'fight',
 'help',
 'tell',
 'attract',
 'girlfriend',
 'activ']

In [18]:
# Terms shared by both current top lists, in the men's list order.
shared_lookup = set(top_women)
overlap = [term for term in top_men if term in shared_lookup]
overlap

['men',
 'reddit',
 'thing',
 'like',
 'feel',
 'girl',
 'date',
 'best',
 'make',
 'favorit',
 'want',
 'someon',
 'relationship',
 'friend',
 'sigoth',
 'think',
 'women',
 'way',
 'life',
 'whi',
 'time',
 'work',
 'woman',
 'peopl',
 'start',
 'good',
 'use',
 'someth',
 'love',
 'know',
 'wear',
 'chang',
 'partner']

In [19]:
# Ten columns most positively correlated with the target (TF-IDF features).
# Each column is correlated with the target directly, rather than building the
# full feature-by-feature matrix with .corr() only to keep its 'target' column.
# The target's own self-correlation (1.0) is still included, so it heads the list.
(tfidf_df.corrwith(tfidf_df['target'])
         .to_frame('target')
         .sort_values('target', ascending=False)
         .head(10))

Unnamed: 0,target
target,1.0
ladi,0.122767
women,0.108697
women reddit,0.094636
thought,0.076447
bodi,0.076123
tip,0.073555
experienc,0.067788
experi like,0.061753
high,0.060795


In [20]:
# Ten columns most negatively correlated with the target (TF-IDF features).
# Each column is correlated with the target directly, rather than building the
# full feature-by-feature matrix with .corr() only to keep its 'target' column.
(tfidf_df.corrwith(tfidf_df['target'])
         .to_frame('target')
         .sort_values('target', ascending=True)
         .head(10))

Unnamed: 0,target
men,-0.221103
men reddit,-0.188517
guy,-0.172816
reddit,-0.099254
wife,-0.085281
girl,-0.081764
fuck,-0.08153
girlfriend,-0.08013
pick,-0.078846
nsfw,-0.074066
