## Importing Libraries and Loading Data

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
from nltk.tokenize import MWETokenizer
import pandas as pd
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the Punkt tokenizer data that nltk.tokenize.word_tokenize (used in token()) relies on.
nltk.download('punkt')

In [None]:
# Load the labelled dataset; later cells read its 'text' and 'label' columns.
lab_data = pd.read_csv('labeled_data.csv')
#unlabeled_data = pd.read_csv('unlabeled_data.csv')

In [None]:
#lab_data = pd.read_csv('./Training Dataset-20191010/labeled_data.csv')
#unlabeled_data = pd.read_csv('./Training Dataset-20191010/unlabeled_data.csv')

In [None]:
# Preview the first rows of the labelled data.
lab_data.head()

In [None]:
# Inspect one raw document (index label 1) with surrounding whitespace removed.
lab_data['text'][1].strip()

## Case Normalisation, Tokenisation, Lemmatisation and Stop-Word Removal

In [None]:
import nltk
# The WordNet corpus backs WordNetLemmatizer, which lemmatization() instantiates.
nltk.download('wordnet')

In [None]:
# Load the English stop-word list (one entry per line). The encoding is given
# explicitly so the result does not depend on the platform's default codec.
with open('./stopwords_en.txt', encoding='utf-8') as f:
    stopwords = f.read().splitlines()

In [None]:
def lemmatization(token_list, lemmatizer=None):
    """Return the lemma of every token in *token_list*.

    Args:
        token_list: Iterable of token strings.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When omitted
            a new ``WordNetLemmatizer`` is created; pass one in to reuse a
            single instance across many calls.

    Returns:
        A list with one lemmatised string per input token, in input order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    # Lemmatise each token exactly once and stay silent: this function runs
    # over every token of every document, so per-token printing floods the
    # notebook output.
    return [lemmatizer.lemmatize(each) for each in token_list]

In [None]:
def token(raw_data):
    """Normalise one raw document into a space-joined string of tokens.

    The text is lower-cased, split with ``nltk.tokenize.word_tokenize``,
    lemmatised via ``lemmatization`` and then stripped of every token that
    appears in the module-level ``stopwords`` collection.

    Args:
        raw_data: The document text as a string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = raw_data.lower()
    tokens = nltk.tokenize.word_tokenize(lowered)
    lemmas = lemmatization(tokens)
    # Hash the stop words once per document so each membership test is O(1)
    # instead of a scan over the whole list.
    stopword_set = set(stopwords)
    return ' '.join(w for w in lemmas if w not in stopword_set)

In [None]:
# Pre-process every document in place. Only the 'text' column is needed, so
# apply over that Series directly rather than materialising each row.
lab_data['text'] = lab_data['text'].apply(lambda text: token(text.strip()))

In [None]:
# Preview the data again now that the 'text' column has been pre-processed.
lab_data.head()

In [None]:
# Plain Python list of the cleaned documents; this is what the TF-IDF vectoriser is fitted on.
train = lab_data['text'].tolist()

## TFIDF + Logistic Regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features built from unigrams and bigrams.
vect = TfidfVectorizer(
    analyzer='word',
    input='content',
    ngram_range=(1, 2),
)

# NOTE(review): the vocabulary and IDF weights are learned from the full
# corpus here, i.e. before the train/test split that follows — confirm this
# is intended, since it lets test documents influence the features.
train_review = vect.fit_transform(train)

In [None]:
# Check what kind of matrix fit_transform produced.
type(train_review)

In [None]:
# import numpy as np
# vocab = vect.get_feature_names()

# dist = np.sum(train_review.toarray(), axis=0)

# for tag, count in zip(vocab, dist):
#     print(count, tag)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the TF-IDF rows for evaluation. A fixed seed keeps the
# partition (and therefore the reported scores) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_review, lab_data['label'], test_size=0.10, random_state=1
)

In [None]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression on the TF-IDF training rows.
model = LogisticRegression(random_state=1, solver='liblinear', multi_class='ovr')
model.fit(X_train, y_train)

# Predicted labels for the held-out rows, used for the confusion matrix.
pred = model.predict(X_test)

In [None]:
# Score the fitted classifier on the held-out split.
model.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
import seaborn as sn

# Confusion matrix as an annotated heat map. fmt='d' prints the integer
# counts verbatim; the default '.2g' renders counts of 100 or more in
# scientific notation, which is hard to read.
plt.figure(figsize=(10, 7))
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel("predicted")
plt.ylabel("truth")

## Word2vec + Logistic

In [None]:
# One token list per pre-processed document; this is the Word2Vec training corpus.
sentences = [review.split(' ') for review in lab_data['text']]

In [None]:
# Number of labelled documents.
len(lab_data)
# IPython help lookup for TfidfVectorizer (notebook-only syntax, not plain Python).
?TfidfVectorizer

In [None]:
# Configure the standard logging module so that Word2Vec reports its
# progress while training.
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Hyper-parameters of the embedding model
num_features = 300    # Word vector dimensionality
min_word_count = 20   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Train the embeddings on the tokenised corpus (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# No further training is planned, so trade the ability to keep training for
# a much smaller memory footprint.
# NOTE(review): `size=` and `init_sims` are the gensim 3.x spellings —
# confirm against the installed gensim version.
model.init_sims(replace=True)

In [None]:
# Sanity check of the embeddings: which word is the odd one out?
# Query through model.wv (the keyed vectors), as the rest of the file does;
# the model-level shortcut is deprecated.
model.wv.doesnt_match("man woman child kitchen".split())

In [None]:
# (vocabulary size, vector dimensionality) of the trained embedding matrix.
# `vectors` is the current name of the array formerly exposed as `syn0`.
model.wv.vectors.shape

In [None]:
# 70/30 split of the pre-processed documents and their labels. A fixed seed
# keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    lab_data.text, lab_data['label'], test_size=0.30, random_state=1
)

In [None]:
def get_sentences(data):
    """Split every document in *data* on single spaces.

    Args:
        data: Iterable of document strings.

    Returns:
        A list holding one list of tokens per document, in input order.
    """
    return [document.split(' ') for document in data]

In [None]:
import numpy as np  # Make sure that numpy is imported

# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens in *words*.

    Args:
        words: Iterable of token strings for one document.
        model: Trained embedding model whose ``wv`` attribute supports
            ``word in model.wv`` and ``model.wv[word]``.
        num_features: Dimensionality of the word vectors.

    Returns:
        A float32 array of length ``num_features`` holding the mean vector,
        or all zeros when no token of *words* is in the vocabulary.
    """
    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    # Membership is tested on the keyed vectors directly; this is a hash
    # lookup, so no per-call copy of the whole vocabulary is needed.
    vectors = model.wv

    for word in words:
        if word in vectors:
            nwords += 1
            featureVec += vectors[word]

    # Only average when something was summed: dividing by zero would fill
    # the result with NaN, which downstream estimators reject.
    if nwords:
        featureVec /= nwords
    return featureVec

# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors for a collection of reviews.

    Args:
        reviews: Sized collection of token lists, one per review.
        model: Embedding model forwarded to ``featureVecMethod``.
        num_features: Dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose i-th
        row is the ``featureVecMethod`` result for the i-th review.
    """
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for position, review in enumerate(reviews):
        # Progress message on every 1000th review
        if position % 1000 == 0:
            print("Review %d of %d"%(position, total))
        reviewFeatureVecs[position] = featureVecMethod(review, model, num_features)

    return reviewFeatureVecs

In [None]:
traindataVecs = getAvgFeatureVecs(sentences, model, num_features )

In [None]:
testdataVecs = getAvgFeatureVecs(get_sentences(X_test), model, num_features )

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
    
print("Fitting random forest to training data....")    
forest = forest.fit(traindataVecs, lab_data['label'])