In [1]:
import os
import time
#basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

#word modeling
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA 
from sklearn.decomposition import FactorAnalysis 
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from gensim.parsing.preprocessing import remove_stopwords
import nltk

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings('ignore',message="Precision")

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


In [2]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every component of ``model``.

    For each row of ``model.components_`` one line of the form
    ``Topic #<idx>: w1 w2 ...`` is printed, listing ``n_top_words`` entries of
    ``feature_names`` in order of decreasing weight. A single empty line is
    printed after the last topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the heaviest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_terms = " ".join([feature_names[i] for i in top_indices])
        print(("Topic #%d: " % topic_idx) + top_terms)
    print()


In [3]:
# Upper bound on the vocabulary size; passed as `max_features` to the vectorizers.
n_features = 5000
# Number of leading rows shown when ranking features (`means_df[:n_top_words]`).
n_top_words = 20

# load data


In [4]:
# Read the review table. Eight column names are supplied explicitly and
# header=None tells pandas not to take column names from the file.
# NOTE(review): the rendered output shows a leading "Unnamed: 0" field, so the
# first CSV field presumably ends up as the index (discarded by reset_index)
# and row 0 is presumably the file's own header line, which `[1:]` then drops
# -- confirm against the CSV.
train_docs = pd.read_csv('../steam_australia.csv', 
                         names=["id", "funny", "helpful", "recommend",
                               "polarity", "positive", "subjectivity", "review"],
                        header=None).reset_index(drop=True)[1:]
# 0-based positions (as produced by enumerate) of the rows whose `recommend`
# value is a Python str; entries of any other type are skipped.
recommend_indices = [i for i, x in enumerate(np.array(train_docs["recommend"])) if type(x) == str]
train_docs

Unnamed: 0,id,funny,helpful,recommend,polarity,positive,subjectivity,review
1,76561197970982479,,No ratings yet,1,0.17444444444444446,True,0.3796031746031746,Simple yet with great replayability. In my opi...
2,js41637,,15 of 20 people (75%) found this review helpful,1,0.11458333333333336,True,0.6604166666666667,I know what you think when you see this title ...
3,evcentric,,No ratings yet,1,0.23154761904761909,True,0.5267857142857143,A suitably punishing roguelike platformer. Wi...
4,doctr,,2 of 2 people (100%) found this review helpful,1,0.25785714285714284,True,0.4723809523809524,This game... is so fun. The fight sequences ha...
5,maplemage,3 people found this review funny,35 of 43 people (81%) found this review helpful,1,0.0,False,0.0,Git gud
...,...,...,...,...,...,...,...,...
25795,76561198306599751,,0 of 1 people (0%) found this review helpful,1,0.39999999999999997,True,0.6000000000000001,I cried in the end its so sadding ]'; I wish l...
25796,Ghoustik,,0 of 1 people (0%) found this review helpful,1,0.0,False,0.0,Gra naprawdę fajna.Ale jest kilka rzeczy do kt...
25797,76561198310819422,1 person found this review funny,1 of 1 people (100%) found this review helpful,1,0.0,False,0.0,Well Done
25798,76561198312638244,,No ratings yet,1,0.2490909090909091,True,0.5509090909090909,this is a very fun and nice 80s themed shooter...


In [5]:
# Review texts to vectorize; they are cast to unicode strings before fitting.
data_samples = train_docs['review']

t0 = time()
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,
    min_df=2,
)
tfidf = tfidf_vectorizer.fit_transform(data_samples.astype('U'))
print("done in %0.3fs." % (time() - t0))


done in 0.827s.


In [6]:
# Raw term counts over the same reviews; this matrix is what the LDA models
# further down are fitted on.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,
    min_df=1,
)
tf = tf_vectorizer.fit_transform(data_samples.astype('U'))


In [7]:
# Integer labels of the rows selected by `recommend_indices`.
true_y = np.array(np.array(train_docs["recommend"])[recommend_indices], dtype=int)

# Select the rows on the sparse matrix first and densify only those rows,
# instead of materialising the full dense matrix and slicing it afterwards.
data = tf[recommend_indices].toarray()

# Sanity check, displayed as the cell output: the distinct label values.
np.unique(true_y)

In [None]:
# Fit a one-component linear discriminant on the count features and time it.
flda = LinearDiscriminantAnalysis(tol=0.001, n_components=1)

t0 = time()
flda.fit(data, true_y)
print("done in %0.3fs." % (time() - t0))

# Precision and accuracy on the training data

In [None]:
# Predict on the rows the model was fitted on. `data` already holds
# tf.toarray()[recommend_indices], so reuse it rather than densifying the
# sparse matrix a second time.
predict_y = flda.predict(data)

# Two columns: prediction, true label.
arr = np.column_stack((predict_y, true_y))

total = len(arr)
# Row positions where the prediction equals the true label.
corrects = [i for i, (pred, actual) in enumerate(arr) if pred == actual]
print(len(corrects), "/", total, len(corrects)/total)

In [None]:
def heatconmat(y_true, y_pred):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as an
    annotated heatmap, then print scikit-learn's classification report.

    Note: this switches the global seaborn plotting context to ``'talk'``.
    """
    sns.set_context('talk')
    plt.figure(figsize=(4, 4))
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = dict(annot=True, fmt='d', cbar=False, cmap='gist_earth_r')
    sns.heatmap(matrix, **heatmap_options)
    plt.show()
    report = classification_report(y_true, y_pred)
    print(report)

In [None]:
# Confusion-matrix heatmap and classification report for the LDA predictions.
heatconmat(y_true=true_y, y_pred=predict_y)

The model predicts "recommended" too often because of the bias (class imbalance) in the data; this needs to be rectified.

In [None]:
# Project the fitted rows onto the single discriminant axis. `data` already
# equals tf.toarray()[recommend_indices], so the sparse matrix is not
# densified again here.
z = flda.transform(data)

# One row per review: prediction, true label, projection z.
df_y = pd.DataFrame(np.hstack((arr, z.reshape(-1, 1))),
                    columns=["predictions", "true", "z"])
df_y

Recommendation distribution

In [None]:
# Bar chart of the true-label counts: rows minus the label sum on the left,
# the label sum on the right.
fig, ax = plt.subplots()
ax.bar(
    [0, 1],
    [len(df_y["true"]) - df_y["true"].sum(), df_y["true"].sum()],
)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Not Recommended", "Recommended"])
ax.grid(False)

In [None]:
sns.set(style="whitegrid")

# Counts per class for the true labels and for the predictions. Summing a
# column counts its positives, assuming the values are 0/1.
n_true_recommend = sum(df_y["true"])
n_false_recommend = len(df_y["true"]) - n_true_recommend
n_true_preds = sum(df_y["predictions"])
n_false_preds = len(df_y["predictions"]) - n_true_preds

x = np.array([0, 1])
width = 0.35  # the width of the bars

# Side-by-side bars: true counts on the left, predicted counts on the right.
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2,
                [n_false_recommend, n_true_recommend],
                width,
                label='true')
rects2 = ax.bar(x + width/2,
                [n_false_preds, n_true_preds],
                width,
                label='predictions')
ax.set_xticks(x)
ax.set_xticklabels(["Not Recommended", "Recommended"])
ax.legend()
ax.grid(False)

In [None]:
# Box plot of the LDA projection z, grouped by true label.
sns.set(color_codes=True, style="ticks")
sns.boxplot(data=df_y, y="z", x="true");

In [None]:
# Box plot of the LDA projection z, grouped by predicted label.
sns.set(color_codes=True, style="ticks")
sns.boxplot(data=df_y, y="z", x="predictions");

In [None]:
# Violin plot of the LDA projection z, grouped by true label.
sns.set(color_codes=True, style="ticks")
sns.violinplot(data=df_y, y="z", x="true");

In [None]:
# Distribution of z for each true class, overlaid on one axes.
sns.set(style="ticks", color_codes=True)
sns.distplot(df_y["z"][df_y["true"] == 1], hist=True, label="Recommended")
sns.distplot(df_y["z"][df_y["true"] == 0], hist=True, label="Not Recommended")
# The labels above only reach the figure once a legend is drawn; without this
# call the two distributions cannot be told apart.
plt.legend();

In [None]:
# Density of z for each predicted class.
sns.set(color_codes=True, style="ticks")
sns.kdeplot(df_y.loc[df_y["predictions"] == 1, "z"],
            shade=True, label="Recommended")
sns.kdeplot(df_y.loc[df_y["predictions"] == 0, "z"],
            shade=True, label="Not Recommended")

In [None]:
# Pair each prediction (column 0) with its true label (column 1).
y = np.array(df_y["predictions"])
arr = np.column_stack((y, true_y))

# Row positions of the correct predictions, then split by predicted class.
corrects = [i for i, (pred, actual) in enumerate(arr) if pred == actual]
corrects_pos = [i for i in corrects if arr[i][0] == 1]
corrects_neg = [i for i in corrects if arr[i][0] == 0]

# Density of z over the correctly classified rows of each class.
sns.set(color_codes=True, style="ticks")
sns.kdeplot(df_y.loc[corrects_pos, "z"], shade=True, label="Recommended")
sns.kdeplot(df_y.loc[corrects_neg, "z"], shade=True, label="Not Recommended")

In [None]:
# Row positions where the prediction DISAGREES with the true label. The
# comparison used to be `==`, which collected the correct predictions.
incorrects = [i for i, x in enumerate(arr) if x[0] != x[1]]
# Split the misclassified rows by the class that was (wrongly) predicted.
incorrects_pos = [i for i in incorrects if arr[i][0] == 1]
incorrects_neg = [i for i in incorrects if arr[i][0] == 0]

# Density of z over the misclassified rows of each predicted class.
sns.set(style="ticks", color_codes=True)
sns.kdeplot(df_y["z"][incorrects_pos], shade=True, label="Recommended")
sns.kdeplot(df_y["z"][incorrects_neg], shade=True, label="Not Recommended")

In [None]:
def rand_jitter(arr):
    """Return ``arr`` plus Gaussian noise whose standard deviation is 1% of
    the value range (max - min) of ``arr``.

    The noise is drawn from NumPy's global random state.
    """
    value_range = max(arr) - min(arr)
    stdev = .01 * value_range
    noise = np.random.randn(len(arr))
    return arr + noise * stdev

# Jitter a COPY of the true labels for plotting only. Writing the jittered
# values back into df_y["true"] would make this cell non-idempotent (every
# re-run stacks more noise) and would leave non-0/1 labels in df_y.
true_jittered = pd.Series(rand_jitter(np.array(df_y["true"], dtype=float)),
                          index=df_y.index)

fig = plt.figure()
ax = fig.add_subplot(111)
# Correct predictions are drawn in blue and shifted right by 0.1; incorrect
# ones are drawn in red and shifted left by 0.1.
for rows, shift, colour in ((corrects_pos, 0.1, 'b'),
                            (corrects_neg, 0.1, 'b'),
                            (incorrects_pos, -0.1, 'r'),
                            (incorrects_neg, -0.1, 'r')):
    ax.scatter(true_jittered[rows] + shift, df_y["z"][rows],
               marker="o", s=20, facecolors='none', edgecolors=colour)
ax.set_xlabel("true label (jittered)")
ax.set_ylabel("z");


In [None]:
# seaborn is already imported as `sns` in the notebook's import cell, so the
# repeated import that used to open this cell has been dropped.
# Rebuild df_y from `arr` and `z`: columns are prediction, true label, z.
df_y = pd.DataFrame(np.hstack((arr, z.reshape(-1, 1))),
                    columns=["predictions", "true", "z"])

# Mean z per true label, split by predicted label.
sns.set(style="whitegrid")
g = sns.catplot(x="true", y="z", hue="predictions", data=df_y,
                height=6, kind="bar", palette="muted")
g.despine(left=True)
g.set_ylabels("z")

In [None]:
# seaborn is already imported as `sns` in the notebook's import cell, so the
# repeated import (and a commented-out sns.set call) were removed.
# Rebuild df_y from `arr` and `z`: columns are prediction, true label, z.
df_y = pd.DataFrame(np.hstack((arr, z.reshape(-1, 1))),
                    columns=["predictions", "true", "z"])

# Box plot of z per true label, split by predicted label.
g = sns.boxplot(x="true", y="z", hue="predictions", data=df_y, palette="muted")

In [None]:
# Violin plot of z per true label, split by predicted label.
g = sns.violinplot(data=df_y, x="true", y="z",
                   hue="predictions", palette="muted")

In [None]:
# Column 0 = prediction, column 1 = true label.
arr = np.concatenate((y.reshape([-1, 1]), true_y.reshape([-1, 1])), 1)

# 1 where the prediction matches the true label, 0 otherwise. The flag used
# to be inverted (0 for a match), which contradicted the "correct" column
# name and the axis label of the plots built from df_y2.
corrects = (arr[:, 0] == arr[:, 1]).astype(int)
# 1 where the model predicted a non-zero class, 0 otherwise.
recommends = (arr[:, 0] != 0).astype(int)

df_y2 = pd.DataFrame(np.column_stack((corrects, recommends, z.reshape(-1))),
                     columns=["correct", "recommend", "z"])

# Mean z per correctness flag, split by predicted class.
sns.set(style="whitegrid")
g = sns.catplot(x="correct", y="z", hue="recommend", data=df_y2,
                height=6, kind="bar", palette="muted")
g.despine(left=True)
g.set_ylabels("z")

In [None]:
# Violin plot of z per "correct" flag, split by the "recommend" flag.
g = sns.violinplot(data=df_y2, x="correct", y="z",
                   hue="recommend", palette="muted")

In [None]:
# Box plot of z per "correct" flag, split by the "recommend" flag.
g = sns.boxplot(data=df_y2, x="correct", y="z",
                hue="recommend", palette="muted")

In [None]:
# Feature names must come from the vectorizer whose output the model was
# fitted on. `flda` was trained on `tf` (tf_vectorizer, min_df=1); the TF-IDF
# vectorizer uses min_df=2 and can therefore select a different vocabulary,
# which would attach coefficients to the wrong words.
# The variable keeps its historical name and (n, 1) shape because later cells
# concatenate it column-wise.
tfidf_feature_names = np.array(tf_vectorizer.get_feature_names()).reshape([-1, 1])

# Build the table column by column so the coefficients stay numeric instead
# of being converted to strings by a mixed-type concatenate.
coefs = pd.DataFrame({
    "coef": flda.coef_.reshape(-1),
    "feature_name": tfidf_feature_names.reshape(-1),
})
coefs

In [None]:
# Store the coefficients as float32, then show the table sorted by
# coefficient in ascending order.
coefs["coef"] = coefs["coef"].astype(np.float32)
coefs.sort_values(by="coef")


In [None]:
# Per-class feature means of the fitted model, shape (n_classes, n_features).
means = flda.means_

# One row per feature, one column per class. This needs a TRANSPOSE:
# reshape([n_features, -1]) keeps the row-major element order and would pair
# up neighbouring features of the same class instead of the two class means
# of one feature. Row 0 of means_ (the smaller class label) -> bad_reference.
means_df = pd.DataFrame(means.T, columns=["bad_reference", "good_reference"])
means_df["feature_name"] = np.ravel(tfidf_feature_names)

# Features with the highest mean in the first class.
means_df["bad_reference"] = means_df["bad_reference"].astype(np.float32)
means_df = means_df.sort_values(by=['bad_reference'], ascending=False)
means_df[:n_top_words]

In [None]:
# Store good_reference as float32, re-sort by it in descending order and
# show the first n_top_words rows.
means_df["good_reference"] = means_df["good_reference"].astype(np.float32)
means_df = means_df.sort_values(by="good_reference", ascending=False)
means_df.head(n_top_words)

In [None]:
from imblearn.over_sampling import (
    SVMSMOTE,
    SMOTE,
    BorderlineSMOTE,
    RandomOverSampler,
    KMeansSMOTE,
    ADASYN,
)

# Resample (data, true_y) with SMOTE using a fixed random_state.
# Alternative left by the author:
#   SVMSMOTE(random_state=42, n_jobs=-1, svm_estimator=SVC(kernel='linear'))
sm = SMOTE(n_jobs=-1, random_state=42)

X_res, y_res = sm.fit_resample(data, true_y)


In [None]:
# Fit a second one-component discriminant, this time on the resampled data.
flda_smote = LinearDiscriminantAnalysis(tol=0.001, n_components=1)

t0 = time()
flda_smote.fit(X_res, y_res)
print("done in %0.3fs." % (time() - t0))

In [None]:
# Predictions of the SMOTE-trained model on the resampled set it was fitted on.
predict_y = flda_smote.predict(X_res)

# Two columns: prediction, resampled label.
arr = np.column_stack((predict_y, y_res))

total = len(arr)
# Row positions where the prediction equals the label.
corrects = [i for i, (pred, actual) in enumerate(arr) if pred == actual]
print(len(corrects), "/", total, len(corrects)/total)

In [None]:
# Projection of the resampled rows onto the discriminant axis.
z = flda_smote.transform(X_res)

# One row per resampled sample: prediction, label, projection z.
df_y = pd.DataFrame(np.hstack((arr, z.reshape(-1, 1))),
                    columns=["predictions", "true", "z"])
df_y

In [None]:
# Bar chart of the label counts after resampling: rows minus the label sum
# on the left, the label sum on the right.
fig, ax = plt.subplots()
ax.bar(
    [0, 1],
    [len(df_y["true"]) - df_y["true"].sum(), df_y["true"].sum()],
)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Not Recommended", "Recommended"])
ax.grid(False)

In [None]:
# Labels of the resampled set, read back out of df_y, compared with the
# SMOTE model's predictions.
true_y2 = np.array(df_y["true"])
heatconmat(y_true=true_y2, y_pred=predict_y)

In [None]:
# Per-class feature means of the SMOTE-trained model,
# shape (n_classes, n_features).
means = flda_smote.means_

# One row per feature, one column per class. This needs a TRANSPOSE:
# reshape([n_features, -1]) keeps the row-major element order and would pair
# up neighbouring features of the same class instead of the two class means
# of one feature. Row 0 of means_ (the smaller class label) -> bad_reference.
means_df = pd.DataFrame(means.T, columns=["bad_reference", "good_reference"])
means_df["feature_name"] = np.ravel(tfidf_feature_names)

# Features with the highest mean in the first class.
means_df["bad_reference"] = means_df["bad_reference"].astype(np.float32)
means_df = means_df.sort_values(by=['bad_reference'], ascending=False)
means_df[:n_top_words]

In [None]:
# Store good_reference as float32, re-sort by it in descending order and
# show the first n_top_words rows.
means_df["good_reference"] = means_df["good_reference"].astype(np.float32)
means_df = means_df.sort_values(by="good_reference", ascending=False)
means_df.head(n_top_words)