In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 

import re
import string
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from bs4 import BeautifulSoup

import os

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
os.getcwd()

'C:\\Users\\sharg'

# Q2 Task 1a :  Loading , cleaning and Data preprocessing

In [3]:
df = pd.read_csv("Downloads/sentiment_train.csv")
df.info()
df.head()
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2202 entries, 0 to 2201
Data columns (total 2 columns):
Sentence    2202 non-null object
Polarity    2202 non-null int64
dtypes: int64(1), object(1)
memory usage: 34.5+ KB


Unnamed: 0,Sentence,Polarity
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


Sentence    object
Polarity     int64
dtype: object

In [4]:
from sklearn.model_selection import train_test_split

X = df['Sentence']
y = df['Polarity']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
type(X_train)
X_train.shape
X_train.head()

type(y_train)
y_train.shape
y_train.head()

pandas.core.series.Series

(1761,)

1963    I use this product in a motor control center w...
486     They had a toro tartare with a cavier that was...
1192    Lately they have been extremely nice and helpf...
755     When I'm on this side of town, this will defin...
1680                 And none of the tones is acceptable.
Name: Sentence, dtype: object

pandas.core.series.Series

(1761,)

1963    1
486     1
1192    1
755     1
1680    0
Name: Polarity, dtype: int64

# Text Preprocessing

In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode
import textstat
import string  

lemmer = WordNetLemmatizer()

def my_preprocess(doc):
    
    # Lowercase
    doc = doc.lower()
    
    # Replace URL with URL string
    doc = re.sub(r'http\S+', 'URL', doc)
    
    # Replace AT with AT string
    doc = re.sub(r'@', 'AT', doc)
    
    # Replace all numbers/digits with the string NUM
    doc = re.sub(r'\b\d+\b', 'NUM', doc)
    
    # Lemmatize each word.
    doc = ' '.join([lemmer.lemmatize(w) for w in doc.split()])
    return doc

In [7]:
# These functions will calculate additional features on the document.
from textblob import TextBlob

def doc_length(corpus):
    return np.array([len(doc) for doc in corpus]).reshape(-1, 1)

def lexicon_count(corpus):
    return np.array([textstat.lexicon_count(doc) for doc in corpus]).reshape(-1, 1)


def Sentiment_polarity(corpus):
    return np.array([TextBlob(doc).sentiment.polarity for doc in corpus]).reshape(-1, 1)


# Task 1b : Feature Engineering
custom features were engineered to check if they contribute to improving the model's classification value
Extracting more featires using BOW, N-grams, vertorization techniques

In [8]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# Need to preprocess the stopwords, because scikit learn's TfidfVectorizer
# removes stopwords _after_ preprocessing
stop_words = [my_preprocess(word) for word in stop_words.ENGLISH_STOP_WORDS]

# This vectorizer will be used to create the BOW features
vectorizer = TfidfVectorizer(preprocessor=my_preprocess, 
                             max_features = 1000, 
                             ngram_range=[1,3],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.25, min_df=0.001, use_idf=True)

# This vectorizer will be used to preprocess the text before topic modeling.
# (I _could_ use the same vectorizer as above- but why limit myself?)
vectorizer2 = TfidfVectorizer(preprocessor=my_preprocess, 
                             max_features =1000, 
                             ngram_range=[1,3],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.25, min_df=0.001, use_idf=True)

nmf = NMF(n_components= 35, random_state=1, init='nndsvda', solver='mu', alpha=.1, l1_ratio=.5)




# Model building
below Pipeline contains feature processing followed by modelling
Model selection: Random Forest, Logistic Regression, MLP
Data was then fit to each model and cross validation was performed for each model while also tuning hyperparameters for MLP and RF model. however for logistic regression there was no hyper parameter tunning.

In [19]:
rf = RandomForestClassifier(criterion='entropy', random_state=225)
lr= LogisticRegression(class_weight=True, random_state=123)
mlp = MLPClassifier(random_state=42, verbose=2, max_iter=250)

feature_processing =  FeatureUnion([ 
    ('bow', Pipeline([('cv', vectorizer), ])),
    ('topics', Pipeline([('cv', vectorizer2), ('nmf', nmf),])),
    ('length', FunctionTransformer(doc_length, validate=False)),
    ('words', FunctionTransformer(lexicon_count, validate=False)),
    ('Sentiment_polarity', FunctionTransformer(Sentiment_polarity, validate=False)),
])

steps = [('features', feature_processing)]

pipe = Pipeline([('features', feature_processing), ('clf', rf)])

param_grid = {}
which_clf = "lr"

if which_clf == "RF":

    steps.append(('clf', rf))
    param_grid = {
        'features__bow__cv__preprocessor': [None, my_preprocess],
        'features__bow__cv__max_features': [200, 500, 1000],
        'features__bow__cv__use_idf': [False],
        'features__topics__cv__stop_words': [None],
        'features__topics__nmf__n_components': [25, 100],
        'clf__n_estimators': [100, 1000],
        'clf__class_weight': [None],
    }
    
elif which_clf == "MLP":
    
    steps.append(('clf', mlp))
    param_grid = {
        'features__bow__cv__preprocessor': [my_preprocess],
        'features__bow__cv__max_features': [1000, 3000],
        'features__bow__cv__min_df': [0],
        'features__bow__cv__use_idf': [False],
        'features__topics__nmf__n_components': [100,300],
        'clf__hidden_layer_sizes': [(100, ), (50, 50), (25, 25, 25)],
    }

elif which_clf == "lr":
    steps.append(('clf', lr))
    
pipe = Pipeline(steps)

search = GridSearchCV(pipe, param_grid, cv=4, n_jobs=4, scoring='f1_micro', return_train_score=True, verbose=2)

# Fit model with Training dataset

In [20]:
search = search.fit(X_train, y_train)
#pipe.fit(X_train, y_train)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    4.9s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
print("Best parameter (CV scy_train%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV scy_train0.806):
{}


In [22]:
# Print out the results of hyperparmater tuning

def cv_results_to_df(cv_results):
    results = pd.DataFrame(list(cv_results['params']))
    results['mean_fit_time'] = cv_results['mean_fit_time']
    results['mean_score_time'] = cv_results['mean_score_time']
    results['mean_train_score'] = cv_results['mean_train_score']
    results['std_train_score'] = cv_results['std_train_score']
    results['mean_test_score'] = cv_results['mean_test_score']
    results['std_test_score'] = cv_results['std_test_score']
    results['rank_test_score'] = cv_results['rank_test_score']

    results = results.sort_values(['mean_test_score'], ascending=False)
    return results

results = cv_results_to_df(search.cv_results_)
results
#results.to_csv('results2.csv', index=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
0,2.562262,0.606038,0.870339,0.00376,0.805812,0.023453,1


# Estimating Model performance on Training data

In [23]:
# Because we are using a pipeline and a GridSearchCV, things are a bit complicated.

# The pipeline with the best performance
pipeline = search.best_estimator_

# Get the feature processing pipeline, so I can use it later
feature_processing_obj = pipeline.named_steps['features']

# Find the vectorizer objects, the NMF objects, and the classifier objects
pipevect= dict(pipeline.named_steps['features'].transformer_list)
vectorizer_obj = pipevect.get('bow').named_steps['cv']
vectorizer_obj2 = pipevect.get('topics').named_steps['cv']
nmf_obj = pipevect.get('topics').named_steps['nmf']
clf_obj = pipeline.named_steps['clf']

# Sanity check - what was vocabSize set to? Should match the output here.
len(vectorizer_obj.get_feature_names())

1000

In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

features_val = feature_processing_obj.transform(X_val).todense()

pred_val = search.predict(X_val)

print("Confusion matrix:")
print(confusion_matrix(y_val, pred_val))

print("\nF1 Score = {:.5f}".format(f1_score(y_val, pred_val, average='micro')))

print("\nClassification Report:")
print(classification_report(y_val, pred_val))

Confusion matrix:
[[199  28]
 [ 35 179]]

F1 Score = 0.85714

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       227
           1       0.86      0.84      0.85       214

    accuracy                           0.86       441
   macro avg       0.86      0.86      0.86       441
weighted avg       0.86      0.86      0.86       441



# Estimating model performance in Test data

In [25]:
test_df = pd.read_csv('Downloads/sentiment_test.csv')

features_test = feature_processing_obj.transform(test_df['Sentence']).todense()
pred_test = search.predict(test_df['Sentence'])
test_df = pd.read_csv('Downloads/sentiment_test.csv')

features_test = feature_processing_obj.transform(test_df['Sentence']).todense()
pred_test = search.predict(test_df['Sentence'])

y_test = test_df['Polarity']

print("Confusion matrix:")
print(confusion_matrix(y_test, pred_test))

print("\nF1 Score = {:.5f}".format(f1_score(y_test, pred_test, average="micro")))

print("\nClassification Report:")
print(classification_report(y_test, pred_test))

Confusion matrix:
[[219  30]
 [ 92 205]]

F1 Score = 0.77656

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.88      0.78       249
           1       0.87      0.69      0.77       297

    accuracy                           0.78       546
   macro avg       0.79      0.78      0.78       546
weighted avg       0.80      0.78      0.78       546



In [26]:
my_submission = pd.DataFrame({'Sentence': test_df.Sentence, 'Sentence_Polarity': test_df.Polarity,'predicted Polarity': pred_test})
my_submission.to_csv('sentiment.csv', index=False)

# Mis_classification Instance

In [27]:
my_submission.head(50)

Unnamed: 0,Sentence,Sentence_Polarity,predicted Polarity
0,A good commentary of today's love and undoubte...,1,1
1,For people who are first timers in film making...,1,1
2,"It was very popular when I was in the cinema, ...",1,1
3,It's a feel-good film and that's how I felt wh...,1,0
4,It has northern humour and positive about the ...,1,1
5,I rather enjoyed it.,1,1
6,I liked it.,1,1
7,I couldn't take them seriously.,0,0
8,It really created a unique feeling though.,1,1
9,Vivian Schilling did an excellent job with the...,1,1
