In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split

In [2]:
import nltk
# Fetch every NLTK resource this notebook relies on (no-op when already cached).
for _resource in ('punkt', 'stopwords', 'state_union',
                  'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_resource)
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Containers for the tokenised tweets: the training list will hold
# (tokens, sentiment) pairs, the testing list will hold token lists only.
training_documents, testing_documents = [], []

In [4]:
# Load the labelled tweets as (tokens, sentiment) pairs.
# newline='' hands newline handling to the csv module, as its documentation
# requires, so quoted fields that contain line breaks are parsed correctly.
# The list is rebuilt (not appended to) so re-running this cell cannot
# duplicate rows.
with open('training_twitter_x_y_train.csv', encoding="latin1", newline='') as file_obj:
    file_data = csv.DictReader(file_obj, skipinitialspace=True)
    training_documents = [
        (word_tokenize(row['text']), row['airline_sentiment'])
        for row in file_data
    ]

In [5]:
# Load the unlabelled tweets as token lists.
# newline='' hands newline handling to the csv module, as its documentation
# requires, so quoted fields that contain line breaks are parsed correctly.
# The list is rebuilt (not appended to) so re-running this cell cannot
# duplicate rows.
with open('test_twitter_x_test.csv', encoding="latin1", newline='') as file_obj:
    file_data = csv.DictReader(file_obj, skipinitialspace=True)
    testing_documents = [word_tokenize(row['text']) for row in file_data]

In [6]:
from nltk.stem import WordNetLemmatizer
# Single module-level lemmatizer instance; clean_review() calls its
# lemmatize() method for every token it keeps.
lemmatizer = WordNetLemmatizer()

In [7]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Translate a POS tag string into the matching ``wordnet`` POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, simple_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return simple_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [8]:
from nltk.corpus import stopwords
import string
# Tokens dropped during cleaning: English stop words plus every single
# ASCII punctuation character.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations)

In [9]:
from nltk import pos_tag
def clean_review(words):
    """Return the lower-cased lemmas of the non-stop-word tokens in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document, in their original order and casing.

    Returns
    -------
    list of str
        One lower-cased lemma per kept token, in input order. Tokens whose
        lower-cased form is in ``stops`` are dropped.
    """
    output_words = []
    # Tag the whole token sequence in a single call. Tagging each word on its
    # own (pos_tag([w])) hides the neighbouring words from the tagger and
    # costs one tagger invocation per token.
    for w, tag in pos_tag(list(words)):
        if w.lower() in stops:
            continue
        # The lemmatizer needs a WordNet POS constant, not the raw tag.
        clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(tag))
        output_words.append(clean_word.lower())
    return output_words

In [10]:
# Replace each training tweet's raw tokens with their cleaned lemmas, keeping
# the sentiment label paired with it.
# NOTE: this rebinds its own input, so running the cell a second time cleans
# the already-cleaned tokens again.
training_documents = [
    (clean_review(tokens), label)
    for tokens, label in training_documents
]

In [11]:
# Replace each test tweet's raw tokens with their cleaned lemmas.
# NOTE: this rebinds its own input, so running the cell a second time cleans
# the already-cleaned tokens again.
testing_documents = [
    clean_review(tokens)
    for tokens in testing_documents
]

In [12]:
# Flatten the cleaned training tweets into one long list of tokens
# (duplicates kept, so the list can be used for frequency counting).
all_words = [
    token
    for entry in training_documents
    for token in entry[0]
]

In [13]:
# Vocabulary selection: count every cleaned training token and keep the
# 3000 most frequent ones as the feature words.
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)  # list of (word, count) pairs
features = [word for word, _count in common]

In [14]:
def get_features_dict(words, feature_words=None):
    """Build a bag-of-words presence dictionary for one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of the document.
    feature_words : iterable of str, optional
        Vocabulary to test against. When omitted, the module-level
        ``features`` list is used, which is the original behaviour.

    Returns
    -------
    dict
        Maps every vocabulary word to ``True`` if it occurs in ``words``
        and ``False`` otherwise, in vocabulary order.
    """
    if feature_words is None:
        # Looked up at call time so the function follows whatever the
        # notebook currently has bound to `features`.
        feature_words = features
    # Set membership keeps each lookup constant-time.
    present = set(words)
    return {w: (w in present) for w in feature_words}

In [15]:
# Labelled feature sets for training: a list of (feature dict, sentiment) tuples.
training_data = [
    (get_features_dict(tokens), label)
    for tokens, label in training_documents
]

In [16]:
# Unlabelled feature sets for prediction: one feature dict per test tweet.
testing_data = [
    get_features_dict(tokens)
    for tokens in testing_documents
]

In [17]:
from nltk import NaiveBayesClassifier

In [18]:
# Fit a Naive Bayes model on the (feature dict, sentiment) pairs.
# The variable name is spelled "classfier"; the prediction cell refers to it
# under that exact spelling.
classfier  = NaiveBayesClassifier.train(training_data)

In [19]:
# Predicted sentiment label for every test tweet, in input order.
Y_predicted = list(map(classfier.classify, testing_data))

In [28]:
# Write the predictions to disk as a single unnamed column:
# one label per line, with no header row and no index column.
df = pd.DataFrame(Y_predicted)
df.to_csv(
    'predictions.csv',
    header=False,
    index=False,
)