In [35]:
import numpy as np
import pandas as pd 
import re
from sklearn.model_selection import train_test_split 

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk.stem.snowball import SnowballStemmer
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
from collections import defaultdict 
import spacy 
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Reading the data
data = pd.read_csv('twcs.csv')
data.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [7]:
# Let's pre-process the data
# Downloading NLTK's stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
# Getting stemmer and stop words
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [24]:
# Defining regex
regex = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [27]:
# Remove link,user and special characters
def preprocess(text, stem=False):
    text = re.sub(regex, ' ', str(text).lower()).strip() # lower case and striping and replacing regex with ' '
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [28]:
data.text=data.text.apply(lambda x: preprocess(x))
data.text.head(10)

0    understand would like assist would need get pr...
1                                              propose
2    sent several private messages one responding u...
3    please send us private message assist click me...
4                                                     
5    please send us private message gain details ac...
6                               worst customer service
7            saddening hear please shoot us dm look kc
8     gonna magically change connectivity whole family
9    understand concerns like please send us direct...
Name: text, dtype: object

In [31]:
data['text'].isnull().sum()
# We do not have any null text in our column text

0

In [36]:
sent = [row.split() for row in data['text']]
# Phrases() takes a list of words as input

In [37]:
# PhrasescAutomatically detect common phrases – aka multi-word expressions
phrases = Phrases(sent, min_count=20, progress_per=10000)

In [39]:
bigram = Phraser(phrases)
# Converting data to Bi-Grams

In [42]:
sentences = bigram[sent]

In [45]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)


397761

In [46]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]
# These are most frequent words in our corpus.

['us', 'please', 'dm', 'help', 'hi', 'get', 'thanks', '2', 'know', 'let']

In [47]:
import multiprocessing
from gensim.models import Word2Vec

In [54]:
# Count the number of cores in my computer
cores = multiprocessing.cpu_count() 
cores

12

In [50]:
# Training a W2V Model
w2v_model = Word2Vec(min_count=15,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [52]:
# Building our W2V vocab
from time import time 
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 2.55 mins


In [55]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)
# This is our vocab size

Vocab size 51487


In [56]:
start = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

# We ran just 10 epochs right now due to time consumption 
# We can train for higher epochs later

Time to train the model: 33.18 mins


In [57]:
w2v_model.wv.most_similar(positive=["disregarded"])
# Just an example to show similar words of disregarded and their corresponding scores.

[('ignored', 0.4445369839668274),
 ('unsympathetic', 0.4098834991455078),
 ('ignoring', 0.39682185649871826),
 ('ignore', 0.38104724884033203),
 ('dismissed', 0.3748735189437866),
 ('negligent', 0.37408536672592163),
 ('dismissive', 0.3720099627971649),
 ('unapologetic', 0.3616518974304199),
 ('ignores', 0.34229981899261475),
 ('inadequate', 0.33695346117019653)]

In [58]:
# Saving model for later use
w2v_model.save("word2vec.model")

In [64]:
data['text'].to_csv('clean_tweets.csv', index=False)