## Twitter Sentiment Analysis Supervised Learning Model using NLTK and SVM

In [1]:
import numpy as np
import pandas as pd

### Data Exploration

In [2]:
# reading data
twitter_training_df = pd.read_csv('./Data/twitter_x_y_train.csv')
twitter_testing_df = pd.read_csv('./Data/twitter_x_test.csv')

In [3]:
training_data = twitter_training_df['text']
training_sentiment_clf = twitter_training_df['airline_sentiment']

In [4]:
testing_data = twitter_testing_df['text']

### Data Cleaning

In [5]:
# importing nlp libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [6]:
import string
stops = stopwords.words('english') + list(string.punctuation)

In [7]:
from nltk.corpus import wordnet
def get_simple_pos(tag): #creating simple tags to pass into the lemmatizer
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
def clean_tweet(words):
    cleaned_words = [] 
    for word in words:
        if word.lower() not in stops:
            lemmatized_word = lemmatizer.lemmatize(word, pos = get_simple_pos(pos_tag([word])[0][1]))
            cleaned_words.append(lemmatized_word.lower())
    return cleaned_words

In [9]:
# joining all cleaned words list
cleaned_training_data = [' '.join(clean_tweet(word_tokenize(tweet))) for tweet in training_data]
cleaned_testing_data = [' '.join(clean_tweet(word_tokenize(tweet))) for tweet in testing_data]

In [10]:
x_train_raw = cleaned_training_data
x_test_raw = cleaned_testing_data
y_train = training_sentiment_clf

### Data Processing

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# creating sparse matrix of each tweet with 5000 features each
count_vec = CountVectorizer(max_features = 5000, ngram_range = (1, 2))
x_train = count_vec.fit_transform(x_train_raw)
x_test = count_vec.transform(x_test_raw)

### Modeling

In [13]:
from sklearn.svm import SVC

In [14]:
# running support vector machines
svc = SVC(kernel = 'rbf')

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
# performing grid search
grid = {
    'C': [1e0, 1e1, 1e2],
    'gamma': [1e-3, 1e-2, 1e-1]
}
cv_svc = GridSearchCV(svc, grid)
cv_svc.fit(x_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1.0, 10.0, 100.0], 'gamma': [0.001, 0.01, 0.1]})

In [17]:
cv_svc.best_estimator_

SVC(C=10.0, gamma=0.01)

### Model Testing

In [18]:
cv_svc.score(x_train, y_train)

0.9031876138433516

In [19]:
# creating the classification report
from sklearn.metrics import classification_report
report = classification_report(y_train, cv_svc.predict(x_train))
print(report)

              precision    recall  f1-score   support

    negative       0.93      0.96      0.94      6851
     neutral       0.82      0.80      0.81      2327
    positive       0.90      0.83      0.86      1802

    accuracy                           0.90     10980
   macro avg       0.88      0.86      0.87     10980
weighted avg       0.90      0.90      0.90     10980



In [20]:
def classify(text): # function to classify raw text
    return cv_svc.predict(count_vec.transform([' '.join(clean_tweet(word_tokenize(text)))]))

In [21]:
classify('there you go')

array(['neutral'], dtype=object)

In [22]:
y_pred = cv_svc.predict(x_test)

In [23]:
# saving the predictions
np.savetxt('./Twitter Sentiment Analysis.csv', y_pred, delimiter = ',', fmt = '%s')