# Sentiment Analysis

This notebook trains a classifier on a robust, labelled Twitter corpus (obtained from a Kaggle challenge) so that the model can be used to tag scraped tweets as either 0 (negative) or 1 (positive).

## Loading and Splitting Data

In [None]:
'''
Get the dataset.
'''
import os
import numpy as np
import pandas as pd
import pprint
from sklearn.model_selection import train_test_split

# Read only the label and text columns of the Kaggle tweet corpus.
# The file is opened in a `with` block so the handle is closed as soon as
# pandas has finished reading, instead of staying open for the whole session.
# error_bad_lines=False is passed so malformed rows do not abort the load.
# NOTE(review): newer pandas releases replace this flag with
# on_bad_lines="skip" -- switch when upgrading pandas.
with open("kaggleTweets.csv", "rb") as data:
    df = pd.read_csv(data, error_bad_lines=False, usecols=['Sentiment', 'SentimentText'])
df.head()

In [None]:
'''
splitting dataset to obtain training and test set.
we will use 80:20 ratio.
'''

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)
# print() with a single argument produces the same output on Python 2 and 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(len(train))
print(len(test))

In [None]:
'''
drop the sentiment from test set
'''
# Unlabelled copy of the held-out rows: same rows as `test`, minus the
# "Sentiment" column.
test_set = test.drop(["Sentiment"], axis="columns")
test_set.head()

## Data Pre-processing

In [None]:
import re

def preprocessor(text):
    """Normalise one raw tweet into lower-case words followed by its emoticons.

    Steps, in order:
      1. HTML tags are replaced by a space.
      2. http/https links are removed, including a link at the very end of
         the tweet.
      3. Runs of whitespace are collapsed to a single space, so neighbouring
         words are never glued together.
      4. Emoticons of the form eyes (`:`, `;`, `=`), optional nose (`-`),
         mouth (`)`, `(`, `D`, `P`) are collected before punctuation is
         stripped.
      5. The text is lower-cased and every run of non-word characters becomes
         a single space.
      6. The collected emoticons (noses removed) are appended, each separated
         by a space so they stay distinct tokens for a whitespace tokenizer.

    "T_T" consists only of word characters, so it is not collected as an
    emoticon; it survives step 5 as the ordinary token "t_t".

    :param text: the raw tweet text.
    :return: the cleaned string; empty if nothing is left after cleaning.
    """
    text = re.sub(r'<[^>]*>', ' ', text)
    # \S+ (rather than "up to the next space") also catches a trailing URL.
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'\s\s+', ' ', text)
    # Only an explicit set of mouths is accepted; a negated character class
    # here would also treat ordinary punctuation such as ": " as an emoticon.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower()).strip()
    faces = ' '.join(emoticons).replace('-', '')
    # Skip empty parts so the result has no leading/trailing separator.
    return ' '.join(part for part in (words, faces) if part)

In [None]:
# Example: run the cleaner on a single tweet (row label 1432703) and show the result.
preprocessor(df.loc[1432703, "SentimentText"])

In [None]:
# Clean every tweet: the SentimentText column is overwritten with
# preprocessor's output, so the raw text is no longer available in df.
df['SentimentText'] = df['SentimentText'].apply(preprocessor)

In [None]:
# Show the first rows of df to inspect the SentimentText column.
df.head()

In [None]:
# Processing into tokens
from nltk.stem.porter import PorterStemmer

# One shared Porter stemmer instance; tokenizer_porter calls it for every word.
porter = PorterStemmer()

def tokenizer(text):
    """Break *text* into tokens on runs of whitespace (no stemming)."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split *text* on whitespace and reduce each word to its Porter stem.

    Uses the module-level `porter` stemmer. Stems that come back as byte
    strings are decoded as UTF-8; stems that are already text are returned
    unchanged. Calling `.decode` unconditionally fails on Python 3 (`str` has
    no `decode`) and can fail on Python 2 for non-ASCII unicode stems.

    :param text: the (already cleaned) tweet text.
    :return: list of stemmed tokens, in order of appearance.
    """
    stems = (porter.stem(word) for word in text.split())
    return [stem.decode('utf-8') if isinstance(stem, bytes) else stem
            for stem in stems]

In [None]:
# Example run: stem the tokens of a single tweet (row label 1432703).
tokenizer_porter(df.loc[1432703, "SentimentText"])

In [None]:
import nltk
# Download the NLTK "stopwords" corpus that the next cell reads from.
nltk.download("stopwords")

In [None]:
# Keep NLTK's English stop words in `stop`; the grid search offers it as one
# of the `vect__stop_words` options (the other being no stop-word removal).
from nltk.corpus import stopwords
stop = stopwords.words('english')

## Training a supervised learning classifier

In [None]:
# Prepare dataset to be operated upon by GridSearchCV.
# The first 90000 rows are used for training and every remaining row for testing.
# The split is positional on purpose: label slices such as df.loc[:90000] include
# their end label, which would place row 90000 in both the training and the test
# set. Slicing the underlying arrays keeps the two sets disjoint.
n_train = 90000
X_train = df["SentimentText"].values[:n_train]
y_train = df["Sentiment"].values[:n_train]
X_test = df["SentimentText"].values[n_train:]
y_test = df["Sentiment"].values[n_train:]

In [None]:
# Using GridSearchCV to find the best parameters for the classifier (LogisticRegression)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Lower-casing and preprocessing are switched off in the vectorizer because
# the tweets were already cleaned and lower-cased by preprocessor().
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Options explored by both grids: unigrams only, with/without stop-word
# removal, plain vs. stemming tokenizer, and the regularisation type/strength.
shared_grid = {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}

# First grid: vectorizer defaults for idf weighting and normalisation.
# Second grid: the same options with idf weighting and normalisation turned off.
param_grid = [shared_grid,
              dict(shared_grid,
                   vect__use_idf=[False],
                   vect__norm=[None]),
              ]

# The solver is pinned to liblinear because it accepts both the 'l1' and 'l2'
# penalties in the grid; relying on the library default breaks the 'l1' runs
# on scikit-learn versions whose default solver does not support 'l1'.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='liblinear'))])

# 5-fold cross-validated search scored on accuracy; n_jobs=-1 parallelises the fits.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [None]:
# Run the cross-validated grid search on the training texts and labels.
gs_lr_tfidf.fit(X_train, y_train)