Given the Twitter US Airline Sentiment dataset, which contains data for over 14,000 tweets,
the task is to predict the sentiment of each tweet, i.e. positive, negative, or neutral.

### Imports Needed

In [1]:
# Data Manipulation
import numpy as np
import pandas as pd
import string

# stopwords and methods for preprocessing text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag 
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# sklearn classifiers and methods
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

### Read train and test data

In [31]:
def load_data(train_path='train.csv', test_path='test.csv', encoding='utf-8'):
    """Read the training and test CSV files into pandas DataFrames.

    Parameters
    ----------
    train_path : str or path-like, default 'train.csv'
        Location of the labelled training file.
    test_path : str or path-like, default 'test.csv'
        Location of the test file.
    encoding : str, default 'utf-8'
        Text encoding used to decode both files.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The training frame followed by the test frame.

    Notes
    -----
    Calling ``load_data()`` with no arguments reads ``train.csv`` and
    ``test.csv`` from the current working directory as UTF-8.
    """
    train_frame = pd.read_csv(train_path, encoding=encoding)
    test_frame = pd.read_csv(test_path, encoding=encoding)
    return train_frame, test_frame

In [32]:
# Load both splits once; later cells read these two frames.
df_train, df_test = load_data()

In [33]:
# Preview the first rows of the training frame to check which columns were loaded.
df_train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [34]:
# Preview the first rows of the test frame for comparison with the training columns.
df_test.head()

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [36]:
# (rows, columns) of the training frame.
df_train.shape

(10980, 12)

In [48]:
# (rows, columns) of the test frame.
df_test.shape

(3660, 11)

### Return the list of all words in a tweet's text

In [66]:
def get_words_list(tweet):
    """Split a tweet into whitespace-separated tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text. ``None`` and float NaN (how pandas represents an
        empty CSV cell) are treated as a tweet with no words.

    Returns
    -------
    list of str
        Tokens in their original order. Punctuation and casing are kept
        exactly as they appear in the tweet; no cleaning is done here.
    """
    # `tweet != tweet` is only true for NaN, so this catches missing cells
    # without importing anything and without hiding other bad input types.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return []
    # str.split() already returns a fresh list, so no extra list() copy is needed.
    return tweet.split()

### Build training and testing documents

In [68]:
def build_train_and_test_documents(train_df=None, test_df=None, text_column='text'):
    """Tokenise every tweet of the training and test frames.

    Parameters
    ----------
    train_df : DataFrame, optional
        Frame holding the training tweets. When omitted, the notebook-level
        ``df_train`` is used.
    test_df : DataFrame, optional
        Frame holding the test tweets. When omitted, the notebook-level
        ``df_test`` is used.
    text_column : str, default 'text'
        Name of the column that contains the tweet text.

    Returns
    -------
    tuple of (list, list)
        ``(training_documents, testing_documents)``. Each document is the
        list of words produced by ``get_words_list`` for one tweet, in the
        same order as the rows of the corresponding frame.
    """
    # Fall back to the notebook globals so existing no-argument calls keep working.
    train_source = df_train if train_df is None else train_df
    test_source = df_test if test_df is None else test_df

    # Build training documents
    training_documents = [get_words_list(tweet) for tweet in train_source[text_column]]

    # Build testing documents
    testing_documents = [get_words_list(tweet) for tweet in test_source[text_column]]

    return training_documents, testing_documents

In [69]:
# Tokenise every tweet once; each entry is the word list of one tweet.
train_documents, test_documents = build_train_and_test_documents()

In [70]:
# Sanity check: number of training documents, then the tokens of the first tweet.
print(len(train_documents))
train_documents[0]

10980


['@SouthwestAir',
 'I',
 'am',
 'scheduled',
 'for',
 'the',
 'morning,',
 '2',
 'days',
 'after',
 'the',
 'fact,',
 'yes..not',
 'sure',
 'why',
 'my',
 'evening',
 'flight',
 'was',
 'the',
 'only',
 'one',
 'Cancelled',
 'Flightled']

In [71]:
# Sanity check: number of test documents, then the tokens of the first tweet.
print(len(test_documents))
test_documents[0]

3660


['@AmericanAir',
 'In',
 'car',
 'gng',
 'to',
 'DFW.',
 'Pulled',
 'over',
 '1hr',
 'ago',
 '-',
 'very',
 'icy',
 'roads.',
 'On-hold',
 'with',
 'AA',
 'since',
 '1hr.',
 "Can't",
 'reach',
 'arpt',
 'for',
 'AA2450.',
 'Wat',
 '2',
 'do?']