In [1]:
import re
import pandas as pd
import tensorflow as tf
from sklearn.metrics import *
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import matplotlib.patches as mpatches
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_extraction.text import CountVectorizer

# Naive Bayes

## Import Data

In [2]:
# Both CSV files are parsed with the same options: the six column names are
# supplied explicitly (so pandas treats the first row as data, not a header)
# and only the label and the tweet text are kept.
csv_columns = ['polarity', 'id', 'date', 'query', 'user', 'text']
read_options = dict(
    names=csv_columns,
    encoding='latin-1',
    usecols=['polarity', 'text'],
)

df = pd.read_csv("data/training.1600000.processed.noemoticon.csv", **read_options)
test_df = pd.read_csv("data/testdata.manual.2009.06.14.csv", **read_options)

# Keep only test rows whose polarity is not 2
# (presumably the neutral class, which the training file lacks -- TODO confirm).
test_df = test_df.loc[test_df['polarity'] != 2]

## Preprocess Data

In [3]:
# Tweets whose raw text exceeds this many characters are discarded.
MAX_TWEET_CHARS = 280

# Patterns are compiled once instead of being re-parsed for every tweet.
# Same pattern text as before; NON_ALNUM_RE is now a raw string so that `\s`
# is no longer an invalid string escape (DeprecationWarning/SyntaxWarning).
MENTION_OR_URL_RE = re.compile('@[^ ]+|http[^ ]+')
NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

# `stop` is kept as the original list; the frozenset gives O(1) membership
# tests instead of a linear scan of the list for every word of every tweet.
stop = stopwords.words('english')
stop_set = frozenset(stop)


def _clean_text(text):
    """Clean one already-lowercased tweet.

    Steps, in the same order as the original cell:
    1. delete @mentions and anything starting with ``http`` (up to the next
       space), then strip surrounding whitespace;
    2. drop whitespace-separated tokens that are in ``stop_set`` and re-join
       the rest with single spaces;
    3. delete every character that is not an ASCII letter, digit or
       whitespace, then strip surrounding whitespace.

    NOTE(review): stopwords are filtered *before* punctuation is stripped, so
    a token with punctuation attached (e.g. ``"the,"``) is compared as-is and
    survives the filter. The order is kept to preserve existing results.
    """
    text = MENTION_OR_URL_RE.sub('', text).strip()
    text = ' '.join(word for word in text.split() if word not in stop_set)
    return NON_ALNUM_RE.sub('', text).strip()


def preprocess_tweets(frame):
    """Return a cleaned copy of a frame with ``text`` and ``polarity`` columns.

    Rows whose text is longer than ``MAX_TWEET_CHARS`` are dropped and the
    index is reset; the text is lowercased and passed through ``_clean_text``;
    polarity label 4 is replaced with 1. The input frame is not modified.
    """
    too_long = frame['text'].str.len() > MAX_TWEET_CHARS
    cleaned = frame.drop(frame[too_long].index).reset_index(drop=True)
    cleaned['text'] = cleaned['text'].str.lower().apply(_clean_text)
    cleaned['polarity'] = cleaned['polarity'].replace(4, 1)
    return cleaned


# The identical pipeline is applied to the training and the test frame.
df = preprocess_tweets(df)
test_df = preprocess_tweets(test_df)

## Tokenize and Create Train/Test Sets

In [4]:
# Tokens are maximal runs of ASCII letters and digits.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# Count unigrams and bigrams, tokenizing with the NLTK tokenizer above and
# filtering scikit-learn's built-in English stop-word list.
# token_pattern=None: the default pattern is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning on every fit.
cv = CountVectorizer(
    tokenizer=token.tokenize,
    token_pattern=None,
    ngram_range=(1, 2),
    stop_words='english',
)

In [5]:
# Materialize both text columns as NumPy unicode arrays.
# NOTE(review): a 'U' array is fixed-width, so this may be memory-hungry for
# the full training set -- worth confirming it is actually needed.
train_text = df['text'].values.astype('U')
test_text = test_df['text'].values.astype('U')

# The vocabulary is learned from the training text only; the test text is
# projected onto that same vocabulary.
X_train = cv.fit_transform(train_text)
X_test = cv.transform(test_text)

# Targets are the polarity columns of the respective frames.
y_train = df['polarity']
y_test = test_df['polarity']

## Training and Results

In [6]:
# Fit a Complement Naive Bayes classifier on the vectorized training data.
cnb = ComplementNB()
cnb.fit(X_train, y_train)

# Score each split exactly once (the original evaluated every score twice,
# which repeats a full prediction pass over the training matrix) and reuse
# the stored values for the printed summary.
train_acc_cnb = cnb.score(X_train, y_train)
test_acc_cnb = cnb.score(X_test, y_test)

print ("Train accuracy ={:.2f}%".format(train_acc_cnb*100))
print ("Test accuracy ={:.2f}%".format(test_acc_cnb*100))

Train accuracy =91.09%
Test accuracy =82.17%


In [7]:
# Predict labels for the test set and print the per-class
# precision / recall / F1 summary.
y_pred_cnb = cnb.predict(X_test)
report_cnb = classification_report(y_test, y_pred_cnb)
print(report_cnb)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       177
           1       0.83      0.82      0.82       182

    accuracy                           0.82       359
   macro avg       0.82      0.82      0.82       359
weighted avg       0.82      0.82      0.82       359

