# Importing Libraries

In [136]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords,wordnet
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# Importing Dataset

In [137]:
dataset=pd.read_csv('Dataset/airline data.csv')
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
len(dataset.columns)

In [138]:
text=dataset['text'].values
text=[word_tokenize(doc) for doc in text]

In [139]:
text[0]

['@', 'VirginAmerica', 'What', '@', 'dhepburn', 'said', '.']

In [140]:
sentiment=dataset['airline_sentiment'].values #'neutral' 'positive' 'negative'

In [141]:
for col in ['airline_sentiment']:
    print(col,dataset[col].unique())

airline_sentiment ['neutral' 'positive' 'negative']


In [178]:
documents=[(text[i],sentiment[i]) for i in range(dataset.shape[0])]

In [177]:
dataset.shape[0]

14640

In [146]:
documents[0]

(['@', 'VirginAmerica', 'What', '@', 'dhepburn', 'said', '.'], 'neutral')

# Dataset Preprocessing
1. TOKENIZING
2. LEMMATIZING
3. REMOVING STOPWORDS
4. PUNCTUATIONS

In [147]:
lemmatizer=WordNetLemmatizer()

In [148]:
def get_simple_pos(tag):   
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [149]:
stops=set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [150]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [151]:
document = [(clean_review(doc), category) for doc, category in documents]

In [162]:
document[0]

(['virginamerica', 'dhepburn', 'say'], 'neutral')

In [163]:
categories = [category for document, category in documents]

In [164]:
categories[:10]

['neutral',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'positive',
 'neutral',
 'positive',
 'positive']

In [172]:
text_documents = [" ".join(document) for document, category in documents]

In [176]:
text_documents[0]

'@ VirginAmerica What @ dhepburn said .'