In [1]:
#Import the required libraries

import pandas as pd
import numpy as np
# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# For regular expressions
import re
# For handling string
import string
# For performing mathematical operations
import math
import matplotlib.pyplot as plt
# For missing values
import missingno as msg
#For datetime
import datetime
# For handling warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# read the engineered dataset

df=pd.read_csv("engineered_train.csv",encoding='utf-8')
df.head()

Unnamed: 0,body,downs,created_utc,score,author,distinguished,archived,subreddit,author_flair_css_class,author_flair_text,...,ups,controversiality,edited,body_clean,positive_sentiment,neutral_sentiment,negative_sentiment,compound_sentiment,nb_chars,nb_words
0,Most of us have some family members like this....,0,1420070400,14,YoungModern,,False,exmormon,,,...,14,0.0,True,family member like family like,0.625,0.375,0.0,0.6124,30,5
1,But Mill's career was way better. Bentham is l...,0,1420070400,3,RedCoatsForever,,False,CanadaPolitics,on,Ontario,...,3,0.0,True,mill's career way well bentham like joseph smi...,0.338,0.662,0.0,0.5574,69,11
2,"Mine uses a strait razor, and as much as i lov...",0,1420070400,1,vhisic,,False,AdviceAnimals,,,...,1,0.0,True,mine use strait razor much love clipper love r...,0.363,0.563,0.074,0.8481,109,21
3,"Very fast, thank you!",0,1420070400,2,Mastersimpson,,False,freedonuts,,,...,2,0.0,True,fast thank,0.714,0.286,0.0,0.3612,10,2
4,"The guy is a professional, and very good at wh...",0,1420070400,6,BigGupp1,,False,WTF,,,...,6,0.0,True,guy professional good highly doubt miss often,0.251,0.346,0.404,-0.1953,45,7


## Feature Selection

In [3]:
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df["body_clean"].apply(lambda x: str(x).split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
doc2vec_df = df["body_clean"].apply(lambda x: model.infer_vector(str(x).split(" "))).apply(pd.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
model_df = pd.concat([df, doc2vec_df], axis=1)


The next step consist in extracting vector representations for every comment. The module Gensim creates a numerical vector representation of every word in the corpus by using the contexts in which they appear (Word2Vec). This is performed using shallow neural networks. What's interesting is that similar words will have similar representation vectors.

Each text can also be transformed into numerical vectors using the word vectors (Doc2Vec). Same texts will also have similar representations and that is why we can use those vectors as training features.

We first have to train a Doc2Vec model by feeding in our text data. By applying this model on our comment, we can get those representation vectors.

In [4]:
model_df.head()

Unnamed: 0,body,downs,created_utc,score,author,distinguished,archived,subreddit,author_flair_css_class,author_flair_text,...,neutral_sentiment,negative_sentiment,compound_sentiment,nb_chars,nb_words,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
0,Most of us have some family members like this....,0,1420070400,14,YoungModern,,False,exmormon,,,...,0.375,0.0,0.6124,30,5,-0.04737,-0.059626,-0.036554,-0.056787,0.045656
1,But Mill's career was way better. Bentham is l...,0,1420070400,3,RedCoatsForever,,False,CanadaPolitics,on,Ontario,...,0.662,0.0,0.5574,69,11,0.058666,0.030832,0.136548,-0.130439,0.023539
2,"Mine uses a strait razor, and as much as i lov...",0,1420070400,1,vhisic,,False,AdviceAnimals,,,...,0.563,0.074,0.8481,109,21,0.002765,-0.00088,0.000634,0.077377,0.199648
3,"Very fast, thank you!",0,1420070400,2,Mastersimpson,,False,freedonuts,,,...,0.286,0.0,0.3612,10,2,0.031561,-0.101834,0.045002,0.107371,0.076499
4,"The guy is a professional, and very good at wh...",0,1420070400,6,BigGupp1,,False,WTF,,,...,0.346,0.404,-0.1953,45,7,0.043643,0.043136,0.045022,0.089806,-0.150516


In [5]:
model_df.shape

(178525, 26)

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder=LabelEncoder()

encoding_col=[]

df[encoding_col]=label_encoder.fit_transform(df[encoding_col])


Now let's print some wordclouds to have a glimpse at what kind of words apear in our reviews:

## Modelling

In [33]:
# feature selection
label = "score"
ignore_cols = [label, "body", "body_clean",'author', 'author_flair_css_class', 'author_flair_text', 'subreddit','distinguished']
features = [c for c in model_df.columns if c not in ignore_cols]

# split the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(model_df[features], model_df[label], test_size = 0.20, random_state = 42)

We first choose which features we want to use to train our model. Then we split our data into two parts:

1. one to train our model
2. one to assess its performances



In [38]:
#Training the model
from sklearn.ensemble import RandomForestClassifier

# to impute any missing values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
it=IterativeImputer(random_state= 42)
X_train_it=pd.DataFrame(it.fit_transform(X_train))


rf = RandomForestClassifier(n_estimators = 1000,
                            max_depth=1, random_state = 42)
rf.fit(X_train_it, y_train)

# show feature importance
feature_importances_df = pd.DataFrame({"feature": features, "importance": rf.feature_importances_}).sort_values("importance", ascending = False)
feature_importances_df.head(20)

Unnamed: 0,feature,importance
4,ups,0.21
15,doc2vec_vector_2,0.201
11,nb_chars,0.151
12,nb_words,0.11
17,doc2vec_vector_4,0.09
13,doc2vec_vector_0,0.059
3,gilded,0.052
14,doc2vec_vector_1,0.05
16,doc2vec_vector_3,0.026
1,created_utc,0.021


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

logreg=LogisticRegression()

rfe = RFE(logreg, 15)             # running RFE with 13 variables as output
rfe = rfe.fit(X_train_it, y_train)
feature_importances_df = pd.DataFrame({"feature": features, "importance": rfe.feature_importances_}).sort_values("importance", ascending = False)
feature_importances_df.head(20)

KeyboardInterrupt: 

In [None]:
model_df.info()