In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
import numpy as np
import re
import string
from nltk.stem import PorterStemmer
import nltk

import datetime
print(datetime.datetime.now())

from sklearn.model_selection import train_test_split


2020-10-12 17:47:04.201082


In [2]:
class clean_text:
    """Text-normalisation helpers that operate on a pandas Series of documents.

    Every method takes a Series ``t``, converts each element with ``str()``
    before processing it, and returns a new Series; the input Series is not
    modified.
    """

    # Patterns are compiled once at class-definition time and shared.
    _MENTION_RE = re.compile(r'@\w+')
    _NUMBER_RE = re.compile(r'\d+')
    # `https?` matches exactly http:// and https://; the previous `http.?`
    # accepted any character after "http".
    _URL_RE = re.compile(r'https?://\S+\s?')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def split_text(self, t):
        """Split each document on single spaces into a list of tokens."""
        return t.apply(lambda x: str(x).split(" "))

    def to_lower(self, t):
        """Lower-case each document."""
        return t.apply(lambda x: str(x).lower())

    def remove_mentions(self, t):
        """Delete ``@name`` style mentions from each document."""
        return t.apply(lambda x: self._MENTION_RE.sub('', str(x)))

    def remove_numbers(self, t):
        """Delete every run of digits from each document."""
        return t.apply(lambda x: self._NUMBER_RE.sub('', str(x)))

    def remove_urls(self, t):
        """Delete http/https URLs, plus one trailing whitespace character if present."""
        return t.apply(lambda x: self._URL_RE.sub('', str(x)))

    def remove_punctuation(self, t):
        """Delete every character listed in ``string.punctuation``."""
        return t.apply(lambda x: str(x).translate(self._PUNCT_TABLE))

    def remove_stopwords(self, t, stop_words=None):
        """Drop stop words from each document.

        Parameters
        ----------
        t : pandas.Series
            Documents to filter.
        stop_words : iterable of str, optional
            Words to remove. Defaults to NLTK's English stop-word list, which
            needs the NLTK ``stopwords`` corpus to be available locally.

        Returns
        -------
        pandas.Series
            One list of the remaining tokens per document (split on single
            spaces).
        """
        if stop_words is None:
            # Imported here because the notebook's import cell does not bring
            # `stopwords` into scope.
            from nltk.corpus import stopwords
            stop_words = stopwords.words('english')
        # Build the lookup once instead of re-reading the list for every token.
        blocked = frozenset(stop_words)
        return t.apply(
            lambda x: [word for word in str(x).split(" ") if word not in blocked]
        )

    def stemming(self, t, stemmer=None):
        """Stem every word of each document and re-join the words with spaces.

        Parameters
        ----------
        t : pandas.Series
            Documents to stem.
        stemmer : object with a ``stem(word)`` method, optional
            Defaults to a fresh ``PorterStemmer``.

        Returns
        -------
        pandas.Series
            One string per document. Runs of whitespace are collapsed to a
            single space as a side effect of tokenising.
        """
        if stemmer is None:
            stemmer = PorterStemmer()
        # Stem token by token: passing the whole sentence to stem() treats it
        # as a single word and leaves all but its tail untouched.
        return t.apply(
            lambda x: " ".join(stemmer.stem(word) for word in str(x).split())
        )

    def ready_data(self, t):
        """Run the full cleaning chain and return the cleaned Series.

        Order: mentions -> URLs -> punctuation -> lower-case -> numbers ->
        stemming. Mentions and URLs are removed before punctuation so that
        their ``@`` and ``://`` markers are still present to match on.
        """
        without_mentions = self.remove_mentions(t)
        without_urls = self.remove_urls(without_mentions)
        without_punct = self.remove_punctuation(without_urls)
        lowered = self.to_lower(without_punct)
        without_numbers = self.remove_numbers(lowered)
        return self.stemming(without_numbers)

In [3]:
# Load the labelled train/test splits; each CSV provides a 'Sentence' text
# column and a 'Polarity' label column.
a = pd.read_csv('Downloads/sentiment_train.csv')
b = pd.read_csv('Downloads/sentiment_test.csv')

# Split each frame into its feature (text) and target (label) series.
X_train, y_train = a['Sentence'], a['Polarity']
X_test, y_test = b['Sentence'], b['Polarity']

Pipeline to extract features using TF-IDF Vectorizer and compare classifiers 

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
import numpy as np
from time import time


def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Prints the accuracy (as a percentage), the combined fit + predict wall
    time, and an 80-character separator line.

    Returns
    -------
    tuple
        ``(accuracy, train_test_time)`` where accuracy is the fraction
        returned by ``accuracy_score`` and the time is in seconds.
    """
    started = time()
    predictions = pipeline.fit(x_train, y_train).predict(x_test)
    elapsed = time() - started

    score = accuracy_score(y_test, predictions)

    print("accuracy score: {0:.2f}%".format(score * 100))
    print("train and test time: {0:.2f}s".format(elapsed))
    print("-" * 80)
    return score, elapsed


from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): this instance is never used -- `tvec` is reassigned further
# down in this cell, right before classifier_comparator is defined.
tvec = TfidfVectorizer()

from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

# Display names and estimators are paired by position, so the two lists must
# stay the same length and in the same order.
names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron","Passive-Aggresive", "Nearest Centroid"]
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    # L1-penalised LinearSVC selects features, then an L2 LinearSVC classifies.
    Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('classification', LinearSVC(penalty="l2")),
    ]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid(),
]
# Materialise the pairs as a list. A bare zip() is a one-shot iterator, and it
# is bound as the default `classifier` argument of classifier_comparator, so a
# second call would otherwise loop over nothing and return an empty result.
zipped_clf = list(zip(names, classifiers))

tvec = TfidfVectorizer()
def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    """Fit and score each classifier behind the same text vectorizer.

    The vectorizer is reconfigured in place with ``stop_words``,
    ``max_features=n_features`` and ``ngram_range``; it is then placed in
    front of each estimator from ``classifier`` (an iterable of
    ``(name, estimator)`` pairs). Each pipeline is trained and evaluated on
    the notebook-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``.

    Returns
    -------
    list of tuple
        One ``(name, accuracy, train_test_time)`` entry per classifier.
    """
    vectorizer.set_params(
        stop_words=stop_words,
        max_features=n_features,
        ngram_range=ngram_range,
    )

    result = []
    for label, estimator in classifier:
        steps = [('vectorizer', vectorizer), ('classifier', estimator)]
        checker_pipeline = Pipeline(steps)
        print("Validation result for {}".format(label))
        print(estimator)
        clf_acc, tt_time = acc_summary(checker_pipeline, X_train, y_train, X_test, y_test)
        result.append((label, clf_acc, tt_time))
    return result

# Compare the default classifier list on TF-IDF features built from
# 1- to 3-grams, with the vocabulary capped at 100,000 terms.
trigram_result = classifier_comparator(n_features=100000,ngram_range=(1,3))

Validation result for Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
accuracy score: 75.17%
train and test time: 0.32s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
accuracy score: 78.17%
train and test time: 0.16s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(memory=None,
         steps=[('fea

Pipeline to predict with LinearSVC, since it had the best accuracy

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
import numpy as np
from time import time


def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, score it on the test split and keep the predictions.

    Prints the accuracy (as a percentage), the combined fit + predict wall
    time, and an 80-character separator line.

    Returns
    -------
    tuple
        ``(accuracy, train_test_time, y_pred)`` where ``y_pred`` holds the
        pipeline's predictions for ``x_test``.
    """
    started = time()
    predictions = pipeline.fit(x_train, y_train).predict(x_test)
    elapsed = time() - started

    score = accuracy_score(y_test, predictions)

    print("accuracy score: {0:.2f}%".format(score * 100))
    print("train and test time: {0:.2f}s".format(elapsed))
    print("-" * 80)
    return score, elapsed, predictions


from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): this instance is never used -- `tvec` is reassigned further
# down in this cell, right before classifier_comparator is defined.
tvec = TfidfVectorizer()

from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

# Only the LinearSVC model is kept for the prediction run.
names = [ "Linear SVC"]
classifiers = [
    LinearSVC() ]
# Materialise the pairs as a list. A bare zip() is a one-shot iterator, and it
# is bound as the default `classifier` argument of classifier_comparator, so a
# second call would otherwise loop over nothing.
zipped_clf = list(zip(names, classifiers))

tvec = TfidfVectorizer()
def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    """Fit and score each classifier, also returning the last predictions.

    The vectorizer is reconfigured in place with ``stop_words``,
    ``max_features=n_features`` and ``ngram_range``; it is then placed in
    front of each estimator from ``classifier`` (an iterable of
    ``(name, estimator)`` pairs). Each pipeline is trained and evaluated on
    the notebook-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``.

    Returns
    -------
    tuple
        ``(result, y_pred)``. ``result`` is a list with one
        ``(name, accuracy, train_test_time)`` entry per classifier.
        ``y_pred`` holds the test-set predictions of the last classifier
        evaluated, or ``None`` when ``classifier`` yields no pairs.
    """
    result = []
    # Pre-bind so that an empty (or already exhausted) `classifier` iterable
    # returns ([], None) instead of raising UnboundLocalError at `return`.
    y_pred = None
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n, c in classifier:
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', c)
        ])
        print("Validation result for {}".format(n))
        print(c)
        clf_acc, tt_time, y_pred = acc_summary(checker_pipeline, X_train, y_train, X_test, y_test)
        result.append((n, clf_acc, tt_time))
    return result, y_pred

# Re-run the comparison with the default classifier list on 1- to 3-gram
# TF-IDF features (vocabulary capped at 100,000 terms), keeping the test-set
# predictions.
trigram_result,y_pred = classifier_comparator(n_features=100000,ngram_range=(1,3))

Validation result for Linear SVC
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
accuracy score: 78.17%
train and test time: 0.13s
--------------------------------------------------------------------------------


In [10]:
#Setting the predicted values as a column in test dataset
# Store the predictions from the classifier_comparator run as a 'Predicted'
# column on the test frame (this mutates `b` in place), then preview the
# first rows.
b['Predicted'] = y_pred
b.head()

Unnamed: 0,Sentence,Polarity,predicted,Predicted
0,A good commentary of today's love and undoubte...,1,1,1
1,For people who are first timers in film making...,1,1,1
2,"It was very popular when I was in the cinema, ...",1,1,1
3,It's a feel-good film and that's how I felt wh...,1,0,0
4,It has northern humour and positive about the ...,1,1,1
