## Importing the Required Libraries

In [1]:
!pip install pandas
import pandas as pd
import numpy as np
!pip install nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import pos_tag
import warnings
warnings.filterwarnings("ignore")
!pip install sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree



## Preparing Training Data

### Importing Training Data

In [2]:
# Load the labelled training tweets; 'train.csv' is resolved relative to the
# notebook's working directory.
df_train = pd.read_csv('train.csv')

In [3]:
# Inspect the raw frame; pandas elides the middle rows and wraps wide columns
# when printing.
print(df_train)

                 tweet_id airline_sentiment     airline  \
0      567900433542488064          negative   Southwest   
1      569989168903819264          positive   Southwest   
2      568089179520954368          positive      United   
3      568928195581513728          negative   Southwest   
4      568594180014014464          negative      United   
...                   ...               ...         ...   
10975  569934458364813313           neutral    American   
10976  568564006329434113          positive      United   
10977  569643648910028801          negative  US Airways   
10978  568864981917110272          negative  US Airways   
10979  568929299350179840          negative      United   

      airline_sentiment_gold           name negativereason_gold  \
0                        NaN  ColeyGirouard                 NaN   
1                        NaN  WalterFaddoul                 NaN   
2                        NaN      LocalKyle                 NaN   
3                      

In [4]:
# Only the tweet text and its sentiment label are used from here on.
selected_columns = ['text', 'airline_sentiment']
df_train = df_train[selected_columns]
print(df_train)

                                                    text airline_sentiment
0      @SouthwestAir I am scheduled for the morning, ...          negative
1      @SouthwestAir seeing your workers time in and ...          positive
2      @united Flew ORD to Miami and back and  had gr...          positive
3         @SouthwestAir @dultch97 that's horse radish 😤🐴          negative
4      @united so our flight into ORD was delayed bec...          negative
...                                                  ...               ...
10975                            @AmericanAir followback           neutral
10976  @united thanks for the help. Wish the phone re...          positive
10977  @usairways the. Worst. Ever. #dca #customerser...          negative
10978  @nrhodes85: look! Another apology. DO NOT FLY ...          negative
10979  @united you are by far the worst airline. 4 pl...          negative

[10980 rows x 2 columns]


In [5]:
# Convert the two-column frame into an array of [text, sentiment] rows.
training_data = df_train.to_numpy()
print(training_data)

[['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled'
  'negative']
 ['@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!'
  'positive']
 ['@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS'
  'positive']
 ...
 ['@usairways the. Worst. Ever. #dca #customerservice' 'negative']
 ['@nrhodes85: look! Another apology. DO NOT FLY @USAirways' 'negative']
 ['@united you are by far the worst airline. 4 plane delays on 1 round trip flight. How is that possible.'
  'negative']]


### Splitting the Tweet text into words using NLTK

In [6]:
# Tokenise every tweet with NLTK, keeping each token list paired with its label.
tweets_train = [
    [word_tokenize(text), sentiment]
    for text, sentiment in training_data
]

### Cleaning the Words using WordNetLemmatizer available in NLTK

In [7]:
# Tokens to discard: NLTK's English stop words plus every character in
# string.punctuation.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations)

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Map a POS tag string (as returned by ``pos_tag``) to a WordNet POS constant.

    The tag is matched on its leading letter: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [9]:
lemmatizer = WordNetLemmatizer()
def clean_tweets(words):
    """Filter and lemmatise the tokens of one tweet.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single tweet, in their original order and casing.

    Returns
    -------
    list of str
        Lower-cased lemmas of the tokens that are purely alphabetic and whose
        lower-cased form is not in ``stops``.
    """
    tokens = list(words)
    if not tokens:
        return []

    output_words = []
    # Tag the whole tweet in one call: the tagger sees neighbouring tokens
    # (tagging one word at a time throws that context away) and is invoked
    # once per tweet instead of once per token.
    for w, tag in pos_tag(tokens):
        if not w.isalpha():
            continue
        lowered = w.lower()
        if lowered in stops:
            continue
        # Lemmatise the lower-cased form: the WordNet lookup is case-sensitive,
        # so a capitalised token such as "Flights" would otherwise come back
        # unchanged and end up as "flights" instead of "flight".
        output_words.append(lemmatizer.lemmatize(lowered, pos = get_simple_pos(tag)))
    return output_words

In [10]:
# Swap each tweet's raw tokens for their cleaned form; the label rides along unchanged.
tweets_train = [
    (clean_tweets(tokens), sentiment)
    for tokens, sentiment in tweets_train
]

In [11]:
# Re-join the cleaned tokens into one space-separated string per tweet (the
# input format CountVectorizer expects) and collect the labels in the same order.
tweets = [" ".join(tokens) for tokens, _ in tweets_train]
y_train = [sentiment for _, sentiment in tweets_train]

### Using Count Vectorizer to get the X Train

In [12]:
#https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/
# Bag-of-words vectorizer whose vocabulary is capped at 2000 features.
count_vec = CountVectorizer(max_features=2000) # Tried using n grams but the accuracy was decreasing
# Learn the vocabulary from the cleaned training tweets and build the training
# document-term matrix in one pass.
x_train_features = count_vec.fit_transform(tweets)

## Preparing Testing Data

In [13]:
# Load the tweets to be classified; 'test.csv' is resolved relative to the
# notebook's working directory.
df_test = pd.read_csv('test.csv')

In [14]:
# Only the 'text' column is read from the test file.
testing_data = np.array(df_test['text'])

In [15]:
# Apply the same tokenise -> clean -> re-join pipeline used for the training tweets.
tweets_test = [
    " ".join(clean_tweets(word_tokenize(text)))
    for text in testing_data
]

In [16]:
# transform only (no refit): reuse the vocabulary learned from the training
# tweets so the test matrix has the same columns as x_train_features.
x_test_features = count_vec.transform(tweets_test)

## Performing Classification

### Support Vector Machine

In [17]:
# Support vector classifier with scikit-learn's default hyperparameters.
svc = SVC()
# fit() is the cell's last expression, so the fitted estimator's repr is displayed.
svc.fit(x_train_features, y_train)

SVC()

In [18]:
# Predict a sentiment label for every test tweet with the fitted SVM.
y_pred_svm = svc.predict(x_test_features)

In [19]:
# Save the SVM predictions as a bare single-column CSV: no index, no header row.
svm_output_path = 'predictions_svm.csv'
df = pd.DataFrame(y_pred_svm)
df.to_csv(svm_output_path, header=False, index=False)

### Random Forest

In [20]:
# A random forest draws bootstrap samples and random feature subsets while
# fitting; pinning random_state makes the fitted model, and the predictions
# derived from it, identical on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train_features, y_train)

RandomForestClassifier(random_state=42)

In [21]:
# Predict a sentiment label for every test tweet with the fitted random forest.
y_pred_rf = rf.predict(x_test_features)

In [22]:
# Save the random-forest predictions as a bare single-column CSV: no index, no header row.
rf_output_path = 'predictions_rf.csv'
df = pd.DataFrame(y_pred_rf)
df.to_csv(rf_output_path, header=False, index=False)

### Multinomial Naive Bayes

In [23]:
# Multinomial Naive Bayes on the raw term counts; alpha is the additive
# smoothing parameter.
mnv = MultinomialNB(alpha = 1)
# fit() is the cell's last expression, so the fitted estimator's repr is displayed.
mnv.fit(x_train_features, y_train)

MultinomialNB(alpha=1)

In [24]:
# Predict a sentiment label for every test tweet with the fitted Naive Bayes model.
y_pred_mnv = mnv.predict(x_test_features)

In [25]:
# Save the Naive Bayes predictions as a bare single-column CSV: no index, no header row.
mnv_output_path = 'predictions_mnv.csv'
df = pd.DataFrame(y_pred_mnv)
df.to_csv(mnv_output_path, header=False, index=False)

### Decision Tree

In [26]:
# scikit-learn permutes the candidate features at every split, so equally good
# splits can be chosen differently between runs; pinning random_state makes the
# fitted tree, and the predictions derived from it, reproducible.
dt = tree.DecisionTreeClassifier(random_state=42)
dt.fit(x_train_features, y_train)

DecisionTreeClassifier(random_state=42)

In [27]:
# Predict a sentiment label for every test tweet with the fitted decision tree.
y_pred_dt = dt.predict(x_test_features)

In [28]:
# Save the decision-tree predictions as a bare single-column CSV: no index, no header row.
dt_output_path = 'predictions_dt.csv'
df = pd.DataFrame(y_pred_dt)
df.to_csv(dt_output_path, header=False, index=False)

##### Multinomial Naive Bayes was found to perform best among the above classifiers.