In [1]:
# Standard library
import string
import warnings

# Silence library warnings before the heavier third-party imports run.
warnings.filterwarnings("ignore")

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Fetch every NLTK resource this notebook relies on (tokenizer model, stopword
# list, WordNet and the POS tagger), not just the tagger, so that
# "Restart Kernel -> Run All" also works on a machine with an empty nltk_data.
# NOTE(review): recent NLTK releases may expect differently named packages
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the
# installed NLTK version.
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/prishashah/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


<b>Preparing Training Data</b>

<b>Importing Training Data</b>

In [2]:
# Load the labelled tweets and keep only the text and its sentiment label.
df_train = pd.read_csv('train.csv').loc[:, ['text', 'airline_sentiment']]

In [3]:
# Row-wise array view of the frame: column 0 is the text, column 1 the label.
training_data = df_train.to_numpy()

<b> Splitting the Tweet text into words using NLTK </b>

In [4]:
# Pair each tweet's token list with its sentiment label.
tweets_train = [
    [word_tokenize(row[0]), row[1]]
    for row in training_data
]

<b>Cleaning the Words using WordNetLemmatizer available in NLTK</b>

In [5]:
# Tokens to discard: English stopwords plus every ASCII punctuation character.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations)

In [6]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

In [7]:
lemmatizer = WordNetLemmatizer()
def clean_tweets(words):
    """Filter and lemmatize the tokens of a single tweet.

    Parameters
    ----------
    words : list of str
        Tokens of one tweet, in their original order and casing.

    Returns
    -------
    list of str
        Lower-cased lemmas of the tokens that are purely alphabetic and are
        not in ``stops``. Returns an empty list for an empty input.
    """
    output_words = []
    # Tag the whole tweet in one call: the tagger uses neighbouring tokens as
    # context, which is lost when every word is tagged on its own, and one
    # call per tweet is far cheaper than one call per token.
    for word, tag in pos_tag(words):
        lowered = word.lower()
        if not word.isalpha() or lowered in stops:
            continue
        # Lemmatize the lower-cased form: WordNet entries are lower-case, so a
        # capitalised token such as "Flights" would otherwise pass through
        # unlemmatized.
        clean_word = lemmatizer.lemmatize(lowered, pos=get_simple_pos(tag))
        output_words.append(clean_word.lower())
    return output_words

In [8]:
# Replace each (tokens, label) entry in place with its cleaned counterpart.
for position, (tokens, label) in enumerate(tweets_train):
    tweets_train[position] = (clean_tweets(tokens), label)

In [9]:
# Separate the labels from the documents; each cleaned token list is joined
# back into one space-separated string.
y_train = [entry[1] for entry in tweets_train]
tweets = [" ".join(entry[0]) for entry in tweets_train]

<b>Using Count Vectorizer to get the X Train</b>

In [10]:
# Bag-of-words counts, limited to 2000 features via max_features.
# N-grams were tried but lowered accuracy, so no ngram_range is passed.
count_vec = CountVectorizer(max_features=2000)
# Learn the vocabulary from the training tweets and build their count matrix.
x_train_features = count_vec.fit_transform(tweets)

<b>Preparing Testing Data</b>

In [11]:
# Load the unlabelled tweets; only the 'text' column is used for prediction.
df_test = pd.read_csv('test.csv')
testing_data = df_test['text'].to_numpy(copy=True)

In [12]:
# Apply the same tokenise -> clean -> re-join steps used for the training tweets.
tweets_test = [
    " ".join(clean_tweets(word_tokenize(raw_text)))
    for raw_text in testing_data
]

In [13]:
# transform (not fit_transform): reuse the vocabulary already fitted on the
# training tweets so the test columns line up with the training columns.
x_test_features = count_vec.transform(tweets_test)

<b>Performing Classification</b>

<b>Support Vector Machine</b>

In [14]:
# Support vector classifier with scikit-learn's default settings.
svc = SVC()
# fit() is left as the last expression so the cell displays the fitted estimator.
svc.fit(x_train_features, y_train)

SVC()

In [15]:
# Predict sentiment labels for the test tweets with the fitted SVM.
# NOTE(review): unlike the other classifiers below, these predictions are not
# written to a CSV file anywhere in the visible cells - confirm that is intended.
y_pred_svm = svc.predict(x_test_features)

<b>Random Forest</b>

In [16]:
# Random forests draw bootstrap samples and feature subsets at random; pin the
# seed so that re-running the notebook yields the same model and predictions.
rf = RandomForestClassifier(random_state=42)
# fit() is left as the last expression so the cell displays the fitted estimator.
rf.fit(x_train_features, y_train)

RandomForestClassifier()

In [17]:
# Predict sentiment labels for the test tweets with the fitted random forest.
y_pred_rf = rf.predict(x_test_features)

In [18]:
# Write the random-forest predictions as a single column, one label per line,
# with neither a header row nor an index column.
df = pd.DataFrame(y_pred_rf)
df.to_csv('predictions_rf.csv', index = False, header = False)

<b>Multinomial Naive Bayes</b>

In [19]:
# Multinomial naive Bayes on the raw term counts; alpha is the additive
# smoothing parameter.
mnv = MultinomialNB(alpha = 1)
# fit() is left as the last expression so the cell displays the fitted estimator.
mnv.fit(x_train_features, y_train)

MultinomialNB(alpha=1)

In [20]:
# Predict sentiment labels for the test tweets with the fitted naive Bayes model.
y_pred_mnv = mnv.predict(x_test_features)

In [21]:
# Write the naive Bayes predictions as a single column, one label per line,
# with neither a header row nor an index column.
df = pd.DataFrame(y_pred_mnv)
df.to_csv('predictions_mnv.csv', index = False, header = False)

<b>Decision Tree</b>

In [22]:
# Pin the seed: the tree builder breaks ties between equally good splits at
# random, so an unseeded tree can differ from one run to the next.
dt = tree.DecisionTreeClassifier(random_state=42)
# fit() is left as the last expression so the cell displays the fitted estimator.
dt.fit(x_train_features, y_train)

DecisionTreeClassifier()

In [23]:
# Predict sentiment labels for the test tweets with the fitted decision tree.
y_pred_dt = dt.predict(x_test_features)

In [26]:
# Write the decision-tree predictions as a single column, one label per line,
# with neither a header row nor an index column.
df = pd.DataFrame(y_pred_dt)
df.to_csv('predictions_dt.csv', index = False, header = False)