In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, validation_curve
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [20]:
# TF-IDF vectorizer over word 1- to 3-grams with English stop words removed.
# NOTE(review): `tfidf` is not referenced by any of the cells visible in this
# part of the notebook (the later cell uses CountVectorizer + TfidfTransformer).
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    min_df=5,
    sublinear_tf=True,
    norm='l2',
    encoding='utf-8',
)


In [8]:
# Download the 'vader_lexicon' NLTK resource, then build the sentiment
# analyzer bound to `sid`; a later cell calls sid.polarity_scores() on each line.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...


In [21]:
# Empty accumulators for the three splits: X_* collect line texts and Y_*
# collect the matching 0/1 gender labels. Each name gets its own list object.
X_train, X_test, X_valid = [], [], []
Y_train, Y_test, Y_valid = [], [], []

In [22]:
# Per-gender accumulators of line texts, filled by the loading cell.
male, female = [], []

In [23]:
def _load_split(path, texts, labels):
    """Read one "+++$+++"-delimited split file and append its gendered lines.

    Every non-blank line must split into exactly seven fields; the fifth is
    the character gender and the sixth is the line text. Gender "m" is
    labelled 0 and gender "f" is labelled 1 (case-insensitive, surrounding
    whitespace ignored). Lines with any other gender value are skipped.

    Args:
        path: Path of the split file to read.
        texts: List that receives the line text of every kept line.
        labels: List that receives the matching 0/1 label.

    Raises:
        ValueError: If a non-blank line does not have exactly seven fields.

    Side effects:
        Each kept text is also appended to the notebook-level ``male`` or
        ``female`` list, whichever split it came from.
    """
    by_gender = {"m": (male, 0), "f": (female, 1)}
    with open(path) as handle:
        for line_no, raw in enumerate(handle, start=1):
            record = raw.strip()
            if not record:
                # A blank line carries no record; skip it instead of failing
                # on the field count below.
                continue
            fields = record.split("+++$+++")
            if len(fields) != 7:
                raise ValueError(
                    "{}:{}: expected 7 '+++$+++'-separated fields, got {}".format(
                        path, line_no, len(fields)))
            # NOTE(review): line_text is kept exactly as split (not stripped),
            # so a whitespace-only text is not "empty" for the later filter.
            chr_gender, line_text = fields[4], fields[5]
            target = by_gender.get(chr_gender.strip().lower())
            if target is None:
                continue
            gender_bucket, label = target
            texts.append(line_text)
            gender_bucket.append(line_text)
            labels.append(label)


# Empty the accumulators first so re-running this cell does not append the
# same rows a second time.
for _bucket in (X_train, Y_train, X_test, Y_test, X_valid, Y_valid, male, female):
    del _bucket[:]

_load_split(r'data/training_set.txt', X_train, Y_train)
_load_split(r'data/test_set.txt', X_test, Y_test)
_load_split(r'data/validation_set.txt', X_valid, Y_valid)


In [24]:
# Pool the three splits into one corpus, keeping the order train, test,
# validation so texts and labels stay aligned by position.
X, Y = [], []
for split_texts, split_labels in ((X_train, Y_train), (X_test, Y_test), (X_valid, Y_valid)):
    X += split_texts
    Y += split_labels

In [25]:
# Positions of zero-length texts in the pooled corpus.
indices = [i for i, text in enumerate(X) if len(text) == 0]

if indices:
    # Filter both lists in place in a single pass (same list objects, same
    # surviving order) rather than deleting one element at a time.
    empty_positions = set(indices)
    X[:] = [text for i, text in enumerate(X) if i not in empty_positions]
    Y[:] = [label for i, label in enumerate(Y) if i not in empty_positions]

In [26]:
# Two-column frame of the pooled corpus: 'text' holds the line text and
# 'target' holds the 0/1 gender label at the same position.
df = pd.DataFrame({'text': X, 'target': Y})

In [27]:
def get_top_data(top_n = 30000, data=None):
    """Return the first ``top_n`` rows of each target class, class 0 first.

    Args:
        top_n: Maximum number of rows to keep per class.
        data: Frame with a 'target' column to sample from. When omitted, the
            notebook-level ``df`` is used.

    Returns:
        A DataFrame holding up to ``top_n`` rows with target 0 followed by up
        to ``top_n`` rows with target 1. Rows keep their original index
        labels and their original relative order within each class.
    """
    if data is None:
        # Fall back to the notebook-level frame when no frame is passed.
        data = df
    top_data_df_male = data[data['target'] == 0].head(top_n)
    top_data_df_female = data[data['target'] == 1].head(top_n)
    return pd.concat([top_data_df_male, top_data_df_female])

# Cap each class at the number of label-1 rows in Y.
# NOTE(review): no cell visible in this part of the notebook consumes
# top_data_df_small; the split cell below is called with the full `df`.
n_label_one = Y.count(1)
top_data_df_small = get_top_data(top_n=n_label_one)

In [28]:
from sklearn.model_selection import train_test_split
def split_train_test(top_data_df_small, test_size=0.1, shuffle_state=True):
    """Split a text/target frame into train and test parts and report class counts.

    Args:
        top_data_df_small: Frame with a 'text' column and a 'target' column.
        test_size: Passed through to train_test_split as ``test_size``.
        shuffle_state: Passed through to train_test_split as ``shuffle``.

    Returns:
        (X_train, X_test, Y_train, Y_test), each a DataFrame produced by
        ``reset_index()``, so the former row index becomes a regular column
        next to 'text' (for X) or 'target' (for Y).

    The split always uses random_state=15. Class counts of both label splits
    and the pre-reset types of the training parts are printed.
    """
    features = top_data_df_small[['text']]
    labels = top_data_df_small['target']
    train_features, test_features, train_labels, test_labels = train_test_split(
        features,
        labels,
        test_size=test_size,
        shuffle=shuffle_state,
        random_state=15,
    )

    for caption, split_labels in (
        ("Value counts for Train genders", train_labels),
        ("Value counts for Test genders", test_labels),
    ):
        print(caption)
        print(split_labels.value_counts())

    # Types are reported before the index reset / to_frame conversion.
    print(type(train_features))
    print(type(train_labels))

    return (
        train_features.reset_index(),
        test_features.reset_index(),
        train_labels.to_frame().reset_index(),
        test_labels.to_frame().reset_index(),
    )

In [29]:
# NOTE(review): the full frame `df` is split here, not the class-capped
# `top_data_df_small` built earlier; the recorded output below shows roughly
# 154k label-0 vs 64k label-1 training rows. Confirm this is intended.
train_text_df, test_text_df, train_target_df, test_target_df = split_train_test(df)

# Keep only the payload column of each returned frame.
X_train, X_test = train_text_df['text'], test_text_df['text']
Y_train, Y_test = train_target_df['target'], test_target_df['target']

Value counts for Train genders
0    153706
1     64114
Name: target, dtype: int64
Value counts for Test genders
0    17062
1     7141
Name: target, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [31]:
# Word 1- to 3-gram counts followed by a TF-IDF re-weighting.
# NOTE(review): both transformers are fitted on the pooled list `X`
# (242023 rows in the recorded output), not on the X_train split, despite the
# `X_train_*` variable names. Confirm before scoring against X_test.
count_vect = CountVectorizer(ngram_range=(1, 3))
tfidf_transformer = TfidfTransformer()

X_train_counts = count_vect.fit_transform(X)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print("Count Vectorized")

Count Vectorized


In [73]:
# VADER 'compound' polarity score for every pooled text, in the same order as X.
sentimentX = [sid.polarity_scores(text)['compound'] for text in X]

In [67]:
# TODO: also add the proportion of proper nouns to the total as a feature.

In [68]:
# Display the sparse count matrix summary (shape, dtype, stored-element count).
X_train_counts

<242023x1834555 sparse matrix of type '<class 'numpy.int64'>'
	with 6176735 stored elements in Compressed Sparse Row format>

In [None]:
# Preview only the first few rows. Iterating the whole matrix prints one
# block of (row, column) count entries per document (242023 rows in the
# recorded run), which floods the notebook output.
PREVIEW_ROWS = 5
for element in X_train_counts[:PREVIEW_ROWS]:
    print(element)

  (0, 1502251)	1
  (0, 358832)	1
  (0, 1502206)	1
  (0, 1008864)	1
  (0, 356084)	1
  (0, 1500513)	1
  (0, 1502273)	1
  (0, 360473)	1
  (0, 1548636)	1
  (0, 1502206)	1
  (0, 356084)	1
  (0, 1500513)	1
  (0, 656058)	1
  (0, 1316591)	1
  (0, 655792)	1
  (0, 1284070)	1
  (0, 1061706)	1
  (0, 1279836)	1
  (0, 819395)	1
  (0, 524957)	1
  (0, 818894)	1
  (0, 1764832)	1
  (0, 664540)	1
  (0, 811720)	1
  (0, 1562234)	1
  (0, 981280)	1
  (0, 537445)	1
  (0, 1190451)	1
  (0, 1804954)	1
  (0, 1063869)	1
  (0, 1562480)	1
  (0, 664456)	1
  (0, 811717)	1
  (0, 1562224)	1
  (0, 981188)	1
  (0, 537421)	1
  (0, 1190354)	1
  (0, 1804377)	1
  (0, 1063841)	1
  (0, 823161)	1
  (0, 661472)	1
  (0, 811641)	1
  (0, 979428)	1
  (0, 535849)	1
  (0, 1186920)	1
  (0, 1780475)	1
  (0, 1061706)	1
  (0, 1548636)	2
  (0, 996207)	1
  (0, 664566)	1
  (0, 794229)	1
  (0, 371678)	1
  (0, 1788741)	1
  (0, 81298)	1
  (0, 1132659)	1
  (0, 1527188)	1
  (0, 158627)	1
  (0, 767847)	1
  (0, 1796293)	1
  (0, 1336339)	1
  (0, 6640