In [3]:
# Directory prefix that is concatenated with the CSV file names when the data is loaded.
path='../input/dataset/'

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the two CSV files found under `path` into DataFrames.
train=pd.read_csv(path+"train.csv")
test=pd.read_csv(path+"test.csv")

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [7]:
train.head()

In [8]:
# Pull the 'comment_text' column out as a plain Python list and show the first entry.
lst = train['comment_text'].tolist()
lst[0]

In [9]:
import re
from bs4 import BeautifulSoup
def clean(comment):
    """Return a lower-cased, URL-free, markup-free, whitespace-normalised comment.

    Steps, in order:
      1. lower-case the text;
      2. delete every run matching ``http\\S+``;
      3. parse the remainder with BeautifulSoup ('lxml') and keep only its text;
      4. collapse all whitespace runs to single spaces and trim the ends.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment.
    """
    without_urls = re.sub(r"http\S+", "", comment.lower())
    plain_text = BeautifulSoup(without_urls, 'lxml').get_text()
    # split() on whitespace followed by a single-space join normalises spacing.
    tokens = plain_text.strip().split()
    return ' '.join(tokens)

In [10]:
# Apply clean() to every raw comment, preserving order.
lst1 = [clean(raw_comment) for raw_comment in lst]

In [11]:
lst1[0]

In [12]:
# Expanding English language contractions
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The substitutions are applied one after another in the order listed
    below; the two specific forms ("won't", "can't") must run before the
    generic "n't" rule so that they are not split incorrectly.

    Parameters
    ----------
    phrase : str
        Text to expand (matching is case-sensitive).

    Returns
    -------
    str
        ``phrase`` with every matched contraction replaced.
    """
    substitutions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [13]:
# Normalise every cleaned comment in place:
#   1. expand contractions,
#   2. drop any whitespace-delimited token that contains a digit,
#   3. replace each run of non-alphanumeric characters with one space.
for i, text in enumerate(lst1):
    text = decontracted(text)
    # Raw string: "\S" and "\d" are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    text = re.sub(r"\S*\d\S*", "", text).strip()  # Remove words with numbers
    text = re.sub('[^A-Za-z0-9]+', ' ', text)  # Remove special characters
    lst1[i] = text

In [14]:
lst1[0]

In [15]:
# Stop-word set used when filtering tokens. Note that this assignment rebinds
# the name `stopwords`, which the import cell also binds to nltk.corpus.stopwords.
# The negations 'no', 'nor' and 'not' are not part of the set, so they survive filtering.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
    "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [16]:
# Remove stop words from each comment. lst1 is updated in place with the
# filtered text and the stripped result is collected in `comments`.
comments = []
for idx, text in enumerate(lst1):
    kept_tokens = [tok.lower() for tok in text.split() if tok.lower() not in stopwords]
    lst1[idx] = ' '.join(kept_tokens)
    comments.append(lst1[idx].strip())

In [17]:
len(comments)

In [18]:
# Tokenise each filtered comment on whitespace: one list of words per comment.
list_of_sentance = [comment.split() for comment in comments]

In [19]:
# Toggle for (re)training the Word2Vec model on the tokenised comments.
want_to_train_w2v = True

if want_to_train_w2v:
    # min_count=2 keeps only words that occur at least 2 times in the corpus;
    # vector_size=100 sets the dimensionality of each word vector.
    w2v_model=Word2Vec(list_of_sentance,min_count=2,vector_size=100, workers=8)

    # Sanity-check the embedding with two probe words. most_similar() raises
    # KeyError for a word that is not in the vocabulary, so check first.
    if 'great' in w2v_model.wv.key_to_index:
        print(w2v_model.wv.most_similar('great'))
    else:
        print("'great' is not in the Word2Vec vocabulary")
    print('='*50)
    if 'worst' in w2v_model.wv.key_to_index:
        print(w2v_model.wv.most_similar('worst'))
    else:
        print("'worst' is not in the Word2Vec vocabulary")

In [20]:
# Vocabulary learned by the Word2Vec model, as a list of words.
w2v_words = list(w2v_model.wv.key_to_index)
# The model was trained with min_count=2, so the message reports that threshold.
print("number of words that occured minimum 2 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

In [21]:
# Fit a TF-IDF vectoriser on the filtered comments.
model = TfidfVectorizer()
tf_idf_matrix = model.fit_transform(comments)
# Build a dictionary with each vocabulary word as key and its idf as value.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on old installations.
if hasattr(model, "get_feature_names_out"):
    dictionary = dict(zip(model.get_feature_names_out(), list(model.idf_)))
else:
    dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

In [22]:
from collections import Counter
from tqdm import tqdm
# TF-IDF weighted Word2Vec
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on old installations.
if hasattr(model, "get_feature_names_out"):
    tfidf_feat = list(model.get_feature_names_out())  # tfidf words/col-names
else:
    tfidf_feat = model.get_feature_names()  # tfidf words/col-names

# Hash-based lookups: testing membership in the original lists scans the whole
# vocabulary for every token, which dominates the runtime on a large corpus.
w2v_vocab = set(w2v_words)
tfidf_vocab = set(tfidf_feat)
# Take the dimensionality from the trained model instead of hard-coding it.
vector_dim = w2v_model.wv.vector_size

tfidf_sent_vectors = []  # the tfidf-w2v vector for each sentence is stored in this list
for sent in tqdm(list_of_sentance):  # for each sentence
    sent_vec = np.zeros(vector_dim)  # accumulator, starts as the all-zero vector
    weight_sum = 0  # sum of the tf-idf weights of the words that have a vector
    term_counts = Counter(sent)  # occurrences of each word in this sentence
    sent_len = len(sent)
    for word in sent:  # for each word in the sentence
        if word in w2v_vocab and word in tfidf_vocab:
            vec = w2v_model.wv[word]
            # To reduce computation the weight is computed directly rather
            # than looked up in the sparse TF-IDF matrix:
            #   dictionary[word]             = idf of the word over the whole corpus
            #   term_counts[word] / sent_len = tf of the word in this sentence
            tf_idf = dictionary[word]*(term_counts[word]/sent_len)
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    # Sentences with no usable word keep the all-zero vector.
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)

In [23]:
len(tfidf_sent_vectors)

In [29]:
tfidf_sent_vectors[2]

In [25]:
type(tfidf_sent_vectors)

In [26]:
# Attach the per-comment TF-IDF-weighted Word2Vec vectors as a new 'vector' column.
train['vector'] = tfidf_sent_vectors

In [27]:
train.head()

In [30]:
# Write the augmented training frame to disk.
# NOTE(review): the 'vector' column holds one NumPy array per row; CSV will
# presumably store each as its text form rather than as numbers. Confirm that
# downstream readers can parse it, or consider a binary format.
train.to_csv("preprocessed_data.csv")