In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.metrics import ConfusionMatrix
import numpy as np
from utils import tokenizer
from functools import reduce
from nltk.classify import NaiveBayesClassifier
import utils2
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train_path = 'NLP_Data/train.csv'
test_path = 'NLP_Data/test.csv'

In [3]:
rating_features = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
target = 'overall'
df = pd.read_csv(train_path)
df[rating_features] = df[rating_features].apply(lambda x: x.fillna(x.mean()))
df = pd.get_dummies(df, columns = ["status", "Place"])
df[["negatives", "positives", "summary"]].fillna(value = "", inplace = True)
df["combined"] = df[["negatives", "positives","summary"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
rating_features.extend(['status_Current Employee ',
       'status_Former Employee ', 'Place_startup_1', 'Place_startup_2',
       'Place_startup_3', 'Place_startup_4', 'Place_startup_5',
       'Place_startup_6'])
df.columns

Index(['ID', 'location', 'date', 'job_title', 'summary', 'positives',
       'negatives', 'advice_to_mgmt', 'score_1', 'score_2', 'score_3',
       'score_4', 'score_5', 'score_6', 'overall', 'status_Current Employee ',
       'status_Former Employee ', 'Place_startup_1', 'Place_startup_2',
       'Place_startup_3', 'Place_startup_4', 'Place_startup_5',
       'Place_startup_6', 'combined'],
      dtype='object')

###  score each reviews using NLTK sentiment analyzer  to use them as additional features

In [4]:
df[["negatives", "positives", "summary"]].fillna(value = "", inplace = True)
df["combined"] = df[["negatives", "positives", "summary"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
df[["negatives", "positives", "summary"]] = df[["negatives", "positives", "summary"]].astype(str)
df["negatives"] = df["negatives"].apply(lambda x: utils2.tokenizer(x))
df["positives"] = df["positives"].apply(lambda x: utils2.tokenizer(x))
df["combined"] = df["combined"].apply(lambda x: utils2.tokenizer(x))

sia = SIA()

df["neg_score"] = df["negatives"].apply(lambda x: sia.polarity_scores(x)["neg"])
df["pos_score"] = df["positives"].apply(lambda x: sia.polarity_scores(x)["pos"])
rating_features.extend(["summary_score", 'neg_score', 'pos_score'])

In [None]:
rating_features = ['score_1',
 'score_2',
 'score_3',
 'score_4',
 'score_5',
 'status_Current Employee ',
 'status_Former Employee ',
 'Place_startup_1',
 'Place_startup_2',
 'Place_startup_3',
 'Place_startup_4',
 'Place_startup_5',
 'Place_startup_6',
 'summary_score',
 'neg_score',
 'pos_score']

In [None]:
sns.heatmap(df[rating_features + [target]].corr(),annot=True)

In [5]:
df_train, df_test = train_test_split(df, test_size = 0.2, stratify = df[target])
df_train.dropna(subset = rating_features, inplace=True )
df_test.dropna(subset= rating_features, inplace=True)
df_train.shape

(24268, 27)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
tfidf = TfidfVectorizer(sublinear_tf=True, tokenizer = tokenizer, min_df=5, max_df = 0.7)
feature_vec= tfidf.fit_transform(df_train["combined"])

In [8]:
tfidf_1 = TfidfVectorizer(sublinear_tf=True, tokenizer = tokenizer, min_df=2, max_df = 0.9)
tfidf_1.fit_transform(df_train[df_train[target] == 1]["combined"])

tfidf_2 = TfidfVectorizer(sublinear_tf=True, tokenizer = tokenizer, min_df=2, max_df = 0.9)
tfidf_2.fit_transform(df_train[df_train[target] == 2]["combined"])

tfidf_3 = TfidfVectorizer(sublinear_tf=True, tokenizer = tokenizer, min_df=2, max_df = 0.9)
tfidf_3.fit_transform(df_train[df_train[target] == 3]["combined"])

tfidf_4 = TfidfVectorizer(sublinear_tf=True, tokenizer = tokenizer, min_df=2, max_df = 0.9)
tfidf_4.fit_transform(df_train[df_train[target] == 4]["combined"])

tfidf_5 = TfidfVectorizer(sublinear_tf=True, tokenizer = tokenizer, min_df=2, max_df = 0.9)
tfidf_5.fit_transform(df_train[df_train[target] == 5]["combined"])

<4780x4328 sparse matrix of type '<class 'numpy.float64'>'
	with 106783 stored elements in Compressed Sparse Row format>

In [9]:
df.overall.value_counts()

4.0    10688
3.0     9510
5.0     5975
2.0     3531
1.0      632
Name: overall, dtype: int64

In [10]:
n = 2000
feature_array_1 = np.array(tfidf_1.get_feature_names())
tfidf_sorting_1 = np.argsort(tfidf_1.idf_)[:-1]
top_1 = set(feature_array_1[tfidf_sorting_1][:n])

n = 200
feature_array_2 = np.array(tfidf_2.get_feature_names())
tfidf_sorting_2 = np.argsort(tfidf_2.idf_)[:-1]
top_2 = set(feature_array_2[tfidf_sorting_2][:n])

n = 200
feature_array_3 = np.array(tfidf_3.get_feature_names())
tfidf_sorting_3 = np.argsort(tfidf_3.idf_)[:-1]
top_3 = set(feature_array_3[tfidf_sorting_1][:n])

n = 100
feature_array_4 = np.array(tfidf_4.get_feature_names())
tfidf_sorting_4 = np.argsort(tfidf_4.idf_)[:-1]
top_4 = set(feature_array_4[tfidf_sorting_1][:n])

n = 200
feature_array_5 = np.array(tfidf_5.get_feature_names())
tfidf_sorting_5 = np.argsort(tfidf_5.idf_)[:-1]
top_5 = set(feature_array_5[tfidf_sorting_5][:n])
# feature_array_YN = np.array(tfidf_YN.get_feature_names())
# tfidf_sorting_YN = np.argsort(tfidf_YN.idf_)[:-1]

word_features =reduce(set.union, [top_1, top_2, top_3, top_4, top_5]) - reduce(set.intersection, [top_1, top_2, top_3, top_4, top_5])

In [11]:
word_features = word_features.intersection(tfidf.get_feature_names())
feature_index = [tfidf.get_feature_names().index(word) for word in list(word_features)]

In [12]:
features = np.array(feature_vec.toarray()[:, list(feature_index)])
features =np.concatenate((features, np.array(df_train[rating_features].values)), axis = 1)

In [13]:
# model = LinearRegression()
# _ = model.fit(features, df_train[target].values)
# pred = model.predict(df_test[rating_features].values)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
models = [
    RandomForestClassifier(n_estimators=50, max_depth=5, random_state=0),
    SVC(),
    LinearSVC(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, list(df_train[target].values), scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [16]:
test_vec = tfidf.transform(df_test["combined"])
test_features = np.array(test_vec.toarray()[:, list(feature_index)])
test_features =np.concatenate((test_features, np.array(df_test[rating_features].values)), axis = 1)

In [17]:
test_features.shape

(6068, 1900)

In [18]:
model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=0)
_ = model.fit(features, list(df_train[target].values))

pred = model.predict(test_features)
accuracy_score(df_test[target].values, pred)

0.3961766644693474

In [19]:
print(f1_score(df_test[target].values, pred, average='macro'))

0.1791552923543597


In [20]:
print(ConfusionMatrix(list(df_test[target].values), list(pred)))

    |    1    2    3    4    5 |
    |    .    .    .    .    . |
    |    0    0    0    0    0 |
----+--------------------------+
1.0 |   <.>   .   97   30    . |
2.0 |    .   <.> 450  256    . |
3.0 |    .    . <623>1279    . |
4.0 |    .    .  357<1781>   . |
5.0 |    .    .   37 1158   <.>|
----+--------------------------+
(row = reference; col = test)



In [11]:
df[['score_1', 'score_2', 'score_3', 'score_4', 'score_5', "summary_score", 'overall']].corr()

Unnamed: 0,score_1,score_2,score_3,score_4,score_5,summary_score,overall
score_1,1.0,0.575368,0.466423,0.418864,0.572163,0.232032,0.613176
score_2,0.575368,1.0,0.583594,0.466312,0.714954,0.286793,0.76017
score_3,0.466423,0.583594,1.0,0.510693,0.62955,0.223543,0.692272
score_4,0.418864,0.466312,0.510693,1.0,0.476415,0.227571,0.542776
score_5,0.572163,0.714954,0.62955,0.476415,1.0,0.245949,0.728527
summary_score,0.232032,0.286793,0.223543,0.227571,0.245949,1.0,0.26873
overall,0.613176,0.76017,0.692272,0.542776,0.728527,0.26873,1.0


In [246]:
rating_features = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
target = 'overall'
df1 = pd.read_csv(test_path)
df1[rating_features] = df1[rating_features].apply(lambda x: x.fillna(x.mean()))
df1 = pd.get_dummies(df1, columns = ["status", "Place"])
df1[["negatives", "positives", "summary"]].fillna(value = "", inplace = True)
df1["combined"] = df1[["negatives", "positives","summary"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
rating_features.extend(['status_Current Employee ',
       'status_Former Employee ', 'Place_startup_1', 'Place_startup_2',
       'Place_startup_3', 'Place_startup_4', 'Place_startup_5',
       'Place_startup_6'])
df1.columns


df1[["negatives", "positives", "summary"]].fillna(value = "", inplace = True)
df1["combined"] = df1[["negatives", "positives", "summary"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
df1[["negatives", "positives", "summary"]] = df1[["negatives", "positives", "summary"]].astype(str)
df1["negatives"] = df1["negatives"].apply(lambda x: utils2.tokenizer(x))
df1["positives"] = df1["positives"].apply(lambda x: utils2.tokenizer(x))
df1["combined"] = df1["combined"].apply(lambda x: utils2.tokenizer(x))

sia = SIA()

df1["summary_score"] = df1["combined"].apply(lambda x: sia.polarity_scores(x)["compound"])
df1["neg_score"] = df1["negatives"].apply(lambda x: sia.polarity_scores(x)["neg"])
df1["pos_score"] = df1["positives"].apply(lambda x: sia.polarity_scores(x)["pos"])
rating_features.extend(["summary_score", 'neg_score', 'pos_score'])

In [247]:
test_vec = tfidf.transform(df1["combined"])
test_features = np.array(test_vec.toarray()[:, list(feature_index)])
test_features =np.concatenate((test_features, np.array(df1[rating_features].values)), axis = 1)

In [324]:
predictions = model.predict(df1[rating_features].values)

In [325]:
ser = {"ID":df1["ID"].values, 'overall': predictions}
df_pred = pd.DataFrame(ser)

In [326]:
df_pred.to_csv('submission.csv', index = False)

In [108]:
cv = CountVectorizer(tokenizer = tokenizer, min_df=1, max_df=0.9)
count_vec = cv.fit_transform(df_train["negatives"].values)

In [134]:
cv1 = CountVectorizer(tokenizer = tokenizer, min_df=5, max_df = 0.90)
count_vec1 = cv1.fit_transform(df_train[df_train[target] == 1]["negatives"])

cv2 = CountVectorizer(tokenizer = tokenizer, min_df=5, max_df = 0.90)
count_vec2 = cv2.fit_transform(df_train[df_train[target] == 2]["negatives"])

cv3 = CountVectorizer(tokenizer = tokenizer, min_df=5, max_df = 0.90)
count_vec3 = cv3.fit_transform(df_train[df_train[target] == 3]["negatives"])

cv4 = CountVectorizer(tokenizer = tokenizer, min_df=5, max_df = 0.90)
count_vec4 =  cv4.fit_transform(df_train[df_train[target] == 4]["negatives"])

cv5 = CountVectorizer(tokenizer = tokenizer, min_df=5, max_df = 0.90)
count_vec5 = cv5.fit_transform(df_train[df_train[target] == 5]["negatives"])

In [135]:
words_1 = {word:word_count  for word, word_count in  zip(cv1.get_feature_names(),count_vec1.toarray().sum(axis=0)) }
words_1 = dict(sorted(words_1.items(), key = lambda x: x[1], reverse=True)[:100])
words_1 = set([x[0] for x in words_1.items()])

words_2 = {word:word_count  for word, word_count in  zip(cv2.get_feature_names(),count_vec2.toarray().sum(axis=0)) }
words_2 = dict(sorted(words_2.items(), key = lambda x: x[1], reverse=True)[:100])
words_2 = set([x[0] for x in words_2.items()])

words_3 = {word:word_count  for word, word_count in  zip(cv1.get_feature_names(),count_vec3.toarray().sum(axis=0)) }
words_3 = dict(sorted(words_3.items(), key = lambda x: x[1], reverse=True)[:10])
words_3 = set([x[0] for x in words_3.items()])

words_4 = {word:word_count  for word, word_count in  zip(cv4.get_feature_names(),count_vec4.toarray().sum(axis=0)) }
words_4 = dict(sorted(words_4.items(), key = lambda x: x[1], reverse=True)[:100])
words_4 = set([x[0] for x in words_4.items()])


words_5 = {word:word_count  for word, word_count in  zip(cv5.get_feature_names(),count_vec5.toarray().sum(axis=0)) }
words_5 = dict(sorted(words_5.items(), key = lambda x: x[1], reverse=True)[:100])
words_5 = set([x[0] for x in words_5.items()])

top_words = [words_1, words_2,words_3,words_4,words_5]

word_features1 = reduce(set.union, top_words) - reduce(set.intersection, top_words)

# from itertools import combinations 
# for words1, words2 in combinations(top_words, 2):
#     word_features1 = reduce(set.union, [word_features1]) - reduce(set.intersection, [words1, words2])
# word_features1 = reduce(set.union, top_words) - reduce(set.intersection, [words_5, words_4])
# word_features1 = reduce(set.union, top_words) - reduce(set.intersection, [words_3, words_4])
# word_features1 = reduce(set.union, top_words) - reduce(set.intersection, [words_2, words_4])
# word_features1 = reduce(set.union, top_words) - reduce(set.intersection, [words_1, words_4])

In [137]:
def extract_features(corpus, features, labels = None):
        ###
        if not(labels is None):
            corpus = [tokenizer(sentence) for sentence, label in zip(corpus, labels)]
            feature_set = [({token: token in tokens for token in features}, label) for tokens, label in zip(corpus, labels)]
        else:
            corpus = [tokenizer(sentence) for sentence in corpus]
            feature_set = [{token: token in tokens for token in features} for tokens in corpus]
        return feature_set

In [138]:
feature_set = extract_features(df_train["negatives"], word_features1, labels = df_train[target])
classifier = NaiveBayesClassifier.train(feature_set)
df_test.dropna(subset=["negatives"], inplace=True)

In [139]:
test_features = extract_features(df_test["negatives"].values, word_features1)
predictions = classifier.classify_many(test_features)
print(ConfusionMatrix(list(df_test[target].values), list(predictions)))
print(accuracy_score(df_test[target].values, predictions))

    |    1    2    3    4    5 |
    |    .    .    .    .    . |
    |    0    0    0    0    0 |
----+--------------------------+
1.0 |   <2>   7   84   30    4 |
2.0 |   11  <24> 405  241   25 |
3.0 |   15   19 <945> 816  107 |
4.0 |   14   12  938<1003> 171 |
5.0 |    3    8  477  579 <128>|
----+--------------------------+
(row = reference; col = test)

0.34640738299274887


In [140]:
print(f1_score(df_test[target].values, predictions, average='macro'))

0.21145715566128115
