## In this kernel I evaluate some of the most popular ML classifiers on a sentiment classification task.

First, import the libraries and load the dataset.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw tweets CSV into a DataFrame and preview its first rows.
tweets_path = "../input/Tweets.csv"
df = pd.read_csv(tweets_path)
df.head()

First of all, let's check whether the data types are correct.

In [None]:
# List the dtype pandas inferred for every column of the loaded frame.
df.dtypes

Everything seems fine. Let's look at the NaN values.

In [None]:
# Count the missing values in each column (`isna` is the canonical name of `isnull`).
df.isna().sum()

To avoid dealing with missing values, I will drop the columns that contain NaN values, along with some other fields that I will not use in further analysis.

In [None]:
# Fields that are excluded from the rest of the analysis.
unused_columns = [
    'negativereason',
    'negativereason_confidence',
    'airline_sentiment_gold',
    'negativereason_gold',
    'tweet_coord',
    'tweet_created',
    'tweet_location',
    'user_timezone',
    'tweet_id',
    'name',
    'airline_sentiment_confidence',
    'retweet_count',
]
df = df.drop(columns=unused_columns)

df.head()

Now let's look at the counts of each sentiment type.

In [None]:
# Bar chart of how many tweets fall into each sentiment class.
sentiment_counts = df['airline_sentiment'].value_counts()
sentiment_counts.plot.bar()

It seems that the dataset is imbalanced, with far too many negative sentiments.

I will convert the airline column to binary (one-hot) categorical features and use TF-IDF for the text column.

In [None]:
# One indicator column per airline name.
# The original 'airline' column is deliberately left in df: a later cell reads it again.
airline_categorical = pd.get_dummies(data=df['airline'])

In [None]:
# Append the airline indicator columns to the right of the existing columns.
df = pd.concat(objs=[df, airline_categorical], axis='columns')
df.head()

Before performing TF-IDF feature extraction on the text column, I need to do some pre-processing, such as removing stop words and separator characters.

In [None]:
# Build the stop-word set once: set membership is O(1), and this avoids
# re-reading the NLTK stop-word list for every single token.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Lower-case `text`, keep only the letters a-z and drop English stop words.

    Every character outside a-z is replaced by a space, the result is split on
    whitespace, tokens found in `stop_words` are removed, and the surviving
    tokens are joined with single spaces.
    """
    letters_only = re.sub('[^a-z]', ' ', text.lower())
    # str.split() already collapses runs of spaces, so no separate squeeze step is needed.
    return ' '.join(word for word in letters_only.split() if word not in stop_words)


df['text'] = df['text'].apply(clean_text)

df.head()

Convert the sentiment column values to integers.

In [None]:
def sentiment_to_code(sentiment):
    """Map a sentiment label to its class id: 'negative' -> 0, 'neutral' -> 1, anything else -> 2."""
    if sentiment == 'negative':
        return 0
    if sentiment == 'neutral':
        return 1
    return 2


df['target'] = df['airline_sentiment'].apply(sentiment_to_code)
df = df.drop(columns=['airline_sentiment'])
df.head()

Drop duplicates, if any exist.

In [None]:
# Remove exact duplicate rows and renumber the index from 0 in one chained step.
df = df.drop_duplicates().reset_index(drop=True)
df.describe(include='all')

Performing train/test split.

In [None]:
# A fixed seed makes the split (and every score computed from it) reproducible
# on a fresh "Restart & Run All". Stratifying on the label keeps the dataset's
# class imbalance identical in both partitions.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['target'],
)
# Renumber both partitions from 0 so they can later be aligned positionally
# with freshly built feature frames.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
print(df_train.shape, df_test.shape)
df_train.head()

Performing text transformation to features (TF-IDF).

In [None]:
# Fit the TF-IDF vectorizer on the training text only and transform that same
# text in one call; the fitted `vectorizer` is reused later for the test text.
vectorizer = TfidfVectorizer()
text_features_train = vectorizer.fit_transform(df_train['text'])
# Display the dimensions of the resulting training feature matrix.
text_features_train.shape

Adding categorical airline data to generated text features.

In [None]:
# Dense TF-IDF matrix followed, column-wise, by the six airline indicator columns.
airline_indicator_columns = [
    'American',
    'Delta',
    'Southwest',
    'US Airways',
    'United',
    'Virgin America',
]
features_train = np.hstack([
    text_features_train.toarray(),
    df_train[airline_indicator_columns].values,
])
features_train.shape

To reduce dimensionality I will try to use PCA.

In [None]:
# Project the features onto their first two principal components.
# For a matrix this large scikit-learn's default 'auto' solver may select the
# randomized SVD, so the seed is pinned to keep the projection reproducible.
pca = PCA(n_components=2, random_state=42)
features_train = pca.fit_transform(features_train)
features_train.shape

In [None]:
# Put the two principal components and the matching label side by side.
pca_frame_train = pd.DataFrame(features_train, columns=['pca_1', 'pca_2'])
df_features_train = pd.concat([pca_frame_train, df_train[['target']]], axis=1)
df_features_train.describe(include='all')

Let's draw a scatter plot and look at how the data are distributed.

In [None]:
# One colour per class id; any id missing from the mapping is drawn in black.
cmap = {0: 'red', 1: 'blue', 2: 'green'}
point_colors = [cmap.get(label, 'black') for label in df_features_train['target']]
df_features_train.plot.scatter(x='pca_1', y='pca_2', c=point_colors)

Let's define the classifiers that will be evaluated for this task.

In [None]:
# Candidate models to compare. The tree-based estimators are randomized, so
# each one gets a fixed seed to keep the reported accuracies reproducible.
Classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(n_estimators=200, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB()]

Train and evaluate the models.

In [None]:
# Push the test rows through the transformers already fitted on the training
# data: TF-IDF -> append airline indicators -> PCA projection.
airline_indicator_columns = [
    'American',
    'Delta',
    'Southwest',
    'US Airways',
    'United',
    'Virgin America',
]
text_features_test = vectorizer.transform(df_test['text'])
features_test = np.hstack([
    text_features_test.toarray(),
    df_test[airline_indicator_columns].values,
])
features_test = pca.transform(features_test)

# Two components plus the label, mirroring the layout of the training frame.
pca_frame_test = pd.DataFrame(features_test, columns=['pca_1', 'pca_2'])
df_features_test = pd.concat([pca_frame_test, df_test[['target']]], axis=1)
df_features_test.shape

In [None]:
# Fit every candidate on the 2-D PCA features and report its test accuracy.
X_train = df_features_train[['pca_1', 'pca_2']]
# Labels are passed as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a DataConversionWarning for every estimator.
y_train = df_features_train['target']
X_test = df_features_test[['pca_1', 'pca_2']]
y_test = df_features_test['target']

for c in Classifiers:
    c.fit(X_train, y_train)
    pred = c.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred), in that order.
    accuracy = accuracy_score(y_test, pred)

    print('Accuracy of ' + c.__class__.__name__ + ' is ' + str(accuracy))

The results are not good. Let's try encoding the airline as a single integer feature instead of binary categorical features.

In [None]:
# Integer code for each airline name; any name not listed here is encoded as 0.
airline_codes = {
    'American': 1,
    'Delta': 2,
    'Southwest': 3,
    'US Airways': 4,
    'United': 5,
    'Virgin America': 6,
}


def encode_airline(names):
    """Return the integer airline codes for the Series `names` as an (n, 1) array.

    The source frame is left untouched, so this cell can be re-run safely
    (overwriting the 'airline' column in place would turn every value into 0
    on a second run, because the already-encoded integers match no name).
    """
    return names.map(airline_codes).fillna(0).astype(int).to_numpy().reshape(-1, 1)


# Re-fit TF-IDF on the training text and append the single integer airline column.
vectorizer = TfidfVectorizer()
text_features_train = vectorizer.fit_transform(df_train['text'])

features_train = np.concatenate([text_features_train.toarray(), encode_airline(df_train['airline'])], axis=1)

# Seeded so the (possibly randomized) SVD gives the same projection on every run.
pca = PCA(n_components=2, random_state=42)
features_train = pca.fit_transform(features_train)

df_features_train = pd.DataFrame(features_train)
df_features_train = pd.concat([df_features_train, df_train[['target']]], axis=1, ignore_index=True)
df_features_train.columns = ['pca_1', 'pca_2', 'target']

# Apply the transformers fitted above to the test rows.
text_features_test = vectorizer.transform(df_test['text'])
features_test = np.concatenate([text_features_test.toarray(), encode_airline(df_test['airline'])], axis=1)

features_test = pca.transform(features_test)
df_features_test = pd.DataFrame(features_test)
df_features_test = pd.concat([df_features_test, df_test[['target']]], axis=1, ignore_index=True)
df_features_test.columns = ['pca_1', 'pca_2', 'target']

In [None]:
# Re-fit every candidate on the current 2-D PCA features and report its test accuracy.
X_train = df_features_train[['pca_1', 'pca_2']]
# Labels are passed as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a DataConversionWarning for every estimator.
y_train = df_features_train['target']
X_test = df_features_test[['pca_1', 'pca_2']]
y_test = df_features_test['target']

for c in Classifiers:
    c.fit(X_train, y_train)
    pred = c.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred), in that order.
    accuracy = accuracy_score(y_test, pred)

    print('Accuracy of ' + c.__class__.__name__ + ' is ' + str(accuracy))

As we can see, the accuracy is not very high. More time needs to be spent on feature engineering and classifier hyperparameter tuning in order to increase it.