# SENTIMENT ANALYSIS USING SVM

### Import components

In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [6]:
# Fetch only the stop-word corpus this notebook reads. A bare nltk.download()
# opens the interactive downloader and stalls a "Restart & Run All".
nltk.download('stopwords')

In [4]:
# Column names are supplied explicitly; header=None tells pandas that the
# first row of the file is data, not a header.
names = ['sentiment', 'tweet_id', 'date', 'query', 'username', 'text']
raw_data = pd.read_csv('training_data.csv', header=None, names=names)

FileNotFoundError: File b'training_data.csv' does not exist

As you can see, our data has some unnecessary columns. Let's clean it up and keep only what's needed.

In [None]:
# Keep only the tweet text and its sentiment label; the other columns are dropped.
clean_data = raw_data.loc[:, ['text', 'sentiment']]
# Summary statistics as a quick sanity check of the remaining data.
clean_data.describe()


Unnamed: 0,polarity
count,1600000.0
mean,2.0
std,2.000001
min,0.0
25%,0.0
50%,2.0
75%,4.0
max,4.0


Now, let's reduce the size of the data (for practice purposes only; in a real-life application, the bigger the dataset the better). I'll do it in a somewhat 'hack-ish' way, using train_test_split.

In [33]:
# Down-sample by keeping only the small "test" partition of a seeded random
# split; the large remainder (unused_data) is not used in the visible cells.
# NOTE(review): test_size=0.001 of the 1,600,000 rows reported earlier would
# yield 1,600 rows, yet the recorded output reports 32,000 — the saved output
# looks stale; confirm the intended fraction.
unused_data, mini_set = train_test_split(clean_data, test_size=0.001, random_state=1)
mini_set.describe()

Unnamed: 0,polarity
count,32000.0
mean,2.0065
std,2.000021
min,0.0
25%,0.0
50%,4.0
75%,4.0
max,4.0


Further cleanup is needed. Let's replace every polarity value of 4 with 1 to indicate positive sentiment.

In [1]:
# Keep only the modelling columns and recode the label to binary:
# 4 -> 1 (positive), anything else -> 0. The recoded column has to be assigned
# back; calling .apply() on its own leaves the DataFrame unchanged.
# NOTE: this cell expects the raw 0/4 labels, so re-run the sampling cell
# before re-running this one.
mini_set = (
    mini_set
    .loc[:, ['text', 'sentiment']]
    .assign(sentiment=lambda df: (df['sentiment'] == 4).astype(int))
)

mini_set.describe()

NameError: name 'mini_set' is not defined

### Machine Learning Model

Split data into train and test sets

In [38]:
# Hold out 20% of the sampled tweets for testing (seeded for reproducibility),
# then separate the text inputs from the sentiment labels.
train, test = train_test_split(mini_set, test_size=0.2, random_state=1)
x_train, y_train = train['text'].values, train['sentiment']
x_test, y_test = test['text'].values, test['sentiment']

In [39]:
def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``TweetTokenizer``."""
    return TweetTokenizer().tokenize(text)


# English Snowball stemmer shared by every call to ``stem``.
stemmer = SnowballStemmer('english')


def stem(document):
    """Lazily yield the Snowball stem of each token in ``document``.

    The document is split with ``tokenize`` (tweet-aware tokenization) and
    every resulting token is passed through the module-level ``stemmer``.

    Parameters
    ----------
    document : str
        Raw text to tokenize and stem.

    Returns
    -------
    generator of str
        One stemmed token per token produced by ``tokenize``.
    """
    return (stemmer.stem(w) for w in tokenize(document))


# NLTK's English stop words, kept as a set for de-duplication and fast lookup.
en_stopwords = set(stopwords.words('english'))

# Unigram bag-of-words counts over lower-cased, tweet-aware tokens.
# scikit-learn accepts `stop_words` only as 'english', a list, or None, so the
# set is converted to a sorted (deterministic) list before being passed in.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 1),
    stop_words=sorted(en_stopwords))


We are going to use cross-validation and grid search to find good hyperparameters for our SVM model. We need to build a pipeline so that features from the validation folds are not used when building each training model.

In [40]:
# Stratified 3-fold splitter, shuffled with a fixed seed so folds are reproducible.
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

In [None]:
# Seed NumPy's global random state before fitting.
np.random.seed(1)

# Vectorizer followed by a linear-kernel SVM. make_pipeline names the SVM
# step "svc", which is the prefix used in the parameter grid below.
pipeline_svm = make_pipeline(
    vectorizer,
    SVC(kernel='linear', class_weight='balanced', probability=True),
)

# Grid-search the regularisation strength C, scoring each candidate by ROC AUC
# on the stratified folds and using all available cores.
grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid={'svc__C': [0.01, 0.1, 1]},
    scoring="roc_auc",
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
)

grid_svm.fit(x_train, y_train)
grid_svm.score(x_test, y_test)


In [None]:
# Parameter setting selected by the grid search.
grid_svm.best_params_

In [None]:
# Mean cross-validated score (ROC AUC, per scoring="roc_auc") of the selected setting.
grid_svm.best_score_

In [None]:
def report_results(model, x, y):
    """Score a fitted classifier on ``x`` against the true labels ``y``.

    Parameters
    ----------
    model : estimator exposing ``predict`` and ``predict_proba``
    x : inputs accepted by ``model``
    y : true labels for ``x``

    Returns
    -------
    dict
        Keys ``'auc'``, ``'f1'``, ``'acc'``, ``'precision'`` and ``'recall'``.
        AUC is computed from the probability column at index 1; the other
        metrics are computed from the hard predictions.
    """
    class1_proba = model.predict_proba(x)[:, 1]
    predicted = model.predict(x)

    return {
        'auc': roc_auc_score(y, class1_proba),
        'f1': f1_score(y, predicted),
        'acc': accuracy_score(y, predicted),
        'precision': precision_score(y, predicted),
        'recall': recall_score(y, predicted),
    }

Let's see how the model works with the test data

In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test split.
report_results(grid_svm.best_estimator_, x_test, y_test)

In [None]:
def get_roc_curve(model, x, y):
    """Return ``(fpr, tpr)`` for ``model`` evaluated on ``x`` against ``y``.

    The curve is built from the probability column at index 1 of
    ``model.predict_proba(x)``; the thresholds from ``roc_curve`` are dropped.
    """
    class1_proba = model.predict_proba(x)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y, class1_proba)
    return false_pos_rate, true_pos_rate

In [None]:
# (fpr, tpr) pair for the best pipeline on the test split; plotted in the next cell.
roc_svm = get_roc_curve(grid_svm.best_estimator_, x_test, y_test)

In [None]:
fpr, tpr = roc_svm

fig, ax = plt.subplots(figsize=(14, 8))
# Model ROC curve in red, with the chance diagonal as a dashed reference line.
ax.plot(fpr, tpr, color="red")
ax.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Roc curve')
plt.show()

Let's see if our model has a bias or variance problem by plotting its learning curve:

In [None]:
# Cross-validated ROC AUC of the tuned pipeline at ten training-set sizes,
# from 10% to 100% of the training portion of each of the 5 folds.
# `learning_curve` is imported with the other scikit-learn names at the top.
train_sizes, train_scores, test_scores = learning_curve(
    grid_svm.best_estimator_, x_train, y_train,
    cv=5,
    scoring="roc_auc",
    train_sizes=np.linspace(.1, 1.0, 10),
    n_jobs=-1,
    random_state=1,  # only takes effect when shuffle=True
)

In [None]:
def plot_learning_curve(x, y, train_sizes, train_scores, test_scores, title='', ylim=None, figsize=(14,8)):
    """Draw training and cross-validation score curves with +/- 1 std bands.

    Parameters
    ----------
    x, y : unused; kept so existing callers keep working.
    train_sizes : training-set sizes used as the x axis.
    train_scores, test_scores : 2-D score arrays; mean and standard deviation
        are taken along axis 1.
    title : figure title.
    ylim : optional ``(low, high)`` limits for the y axis.
    figsize : figure size passed to ``plt.figure``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # (mean, std, colour, legend label) for each curve, training curve first.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]
    plt.grid()

    # Shaded bands first, then the lines, matching the draw order of the curves.
    for mean, std, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std,
                         alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="lower right")
    return plt

In [None]:
# Plot the learning-curve scores for the tuned SVM pipeline.
# The training inputs are named x_train (lower-case) in this notebook.
plot_learning_curve(x_train, y_train, train_sizes,
                    train_scores, test_scores, ylim=(0.7, 1.01), figsize=(14,6))
plt.show()

# Examples

In [None]:
# Predict the label of a single hand-written example tweet.
grid_svm.predict(["flying with @united is always a great experience"])

In [None]:
# A second hand-written example whose second sentence retracts the first.
grid_svm.predict(["I love @united. Sorry, just kidding!"])

# Acknowledgements

- Luis Bronchal __[Sentiment analysis with SVM](https://www.kaggle.com/lbronchal/sentiment-analysis-with-svm)__
- Twitter Data __[Sentiment140](http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip)__