# SENTIMENT ANALYSIS USING SVM

### Import components

In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [9]:
# Column labels applied to the CSV; because `names` is supplied and no header
# row is requested, pandas treats the first line of the file as data.
names = [
    'polarity',
    'tweet_id',
    'date',
    'query',
    'username',
    'text',
]
raw_data = pd.read_csv('training_data.csv', header=None, names=names)

As you can see, our data has some unnecessary columns. Let's clean it up and keep only what's needed.

In [34]:
# Keep just the tweet body and its polarity label; every other column is dropped.
clean_data = raw_data.loc[
    :,                      # every row
    ['text', 'polarity'],   # only these two columns
]
clean_data.describe()


Unnamed: 0,polarity
count,1600000.0
mean,2.0
std,2.000001
min,0.0
25%,0.0
50%,2.0
75%,4.0
max,4.0


Now, let's reduce the size of the data (for practice purposes only; in a real-life application, the bigger the dataset the better). I'll do it in a somewhat 'hack-ish' way: call train_test_split and keep only the small 'test' portion.

In [33]:
# Subsample via train_test_split: the 2% "test" partition becomes the working
# set, and the remaining 98% is bound to `unused_data`.
unused_data, mini_set = train_test_split(
    clean_data,
    test_size=0.02,
    random_state=1,
)
mini_set.describe()

Unnamed: 0,polarity
count,32000.0
mean,2.0065
std,2.000021
min,0.0
25%,0.0
50%,4.0
75%,4.0
max,4.0


Further cleanup is needed. Let's map polarity 4 to 1 (positive sentiment) and every other polarity value to 0.

In [35]:
# Derive a binary label (1 where polarity == 4, 0 otherwise) and keep only the
# tweet text alongside it.
# `mini_set` is a frame returned by train_test_split, so adding a column in
# place (`mini_set['sentiment'] = ...`) can trigger pandas'
# SettingWithCopyWarning. `.assign()` builds a new frame instead, and the
# vectorised comparison replaces the per-row Python lambda.
mini_set = (
    mini_set
    .assign(sentiment=lambda frame: frame['polarity'].eq(4).astype('int64'))
    .loc[:, ['text', 'sentiment']]
)

mini_set.describe()

Unnamed: 0,sentiment
count,32000.0
mean,0.501625
std,0.500005
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


### Machine Learning Model

Split data into train and test sets

In [38]:
# 80/20 split of the reduced data set, seeded for repeatability.
train, test = train_test_split(
    mini_set,
    test_size=0.2,
    random_state=1,
)

# Text features are extracted with `.values`; the labels are kept as the
# pandas Series selected from each partition.
x_train, y_train = train['text'].values, train['sentiment']
x_test, y_test = test['text'].values, test['sentiment']

In [39]:
def tokenize(text):
    """Tokenize `text` with a default-configured NLTK ``TweetTokenizer``.

    A new tokenizer instance is created on every call.

    Returns:
        Whatever ``TweetTokenizer().tokenize(text)`` returns for the input.
    """
    return TweetTokenizer().tokenize(text)


def stem(document):
    """Lazily yield the stemmed form of every token in `document`.

    The document is first passed through the module-level ``analyzer``
    callable, and each resulting token is then run through
    ``stemmer.stem``.

    NOTE(review): neither ``stemmer`` nor ``analyzer`` is defined in the
    visible part of this notebook -- confirm both exist before this
    function is called, otherwise it raises ``NameError``.

    Returns:
        A generator of stemmed tokens.
    """
    tokens = analyzer(document)
    return (stemmer.stem(token) for token in tokens)


en_stopwords = set(stopwords.words('english'))

# Bag-of-words features: lower-cased unigrams produced by `tokenize`, with
# English stop words removed.
# `stop_words` must be 'english', a list, or None -- recent scikit-learn
# releases validate the parameter type and reject a set when the vectorizer
# is fitted, so the set is converted to a list here. Sorting keeps the list
# order deterministic across runs.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 1),
    stop_words=sorted(en_stopwords))


We are going to use cross-validation and grid search to find good hyperparameters for our SVM model. We need to build a pipeline so that features are not extracted from the validation folds when each training model is built.

In [40]:
# Three shuffled, class-stratified folds with a fixed seed.
kfolds = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=1,
)

In [None]:
# Seed NumPy's global RNG before building the model pipeline.
np.random.seed(1)

# Vectorizer followed by a linear-kernel SVC with probability estimates
# enabled and balanced class weights.
pipeline_svm = make_pipeline(
    vectorizer,
    SVC(kernel='linear', probability=True, class_weight='balanced'),
)
grid_svm = 