In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
import seaborn as sns
import matplotlib.pyplot as plt
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the tweets CSV and convert the `created_at` strings to timestamps in one chain.
tweets = pd.read_csv('/kaggle/input/trainings/narendramodi_tweets.csv').assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'],
                                            format="%Y-%m-%d %H:%M:%S")
)
tweets.shape

In [None]:
# Summary statistics for the parsed tweet timestamps.
tweets['created_at'].describe()

In [None]:
# Normalise the tweet text: lower-case it and keep only letters, whitespace,
# '#' and '@'. Missing text becomes an empty document so the string
# operations and the later ' '.join(docs) never meet a NaN.
docs = tweets['text'].fillna('').str.lower()
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would silently skip the cleanup.
# The raw string avoids the invalid "\s" escape-sequence warning.
docs = docs.str.replace(r'[^a-z\s#@]', '', regex=True)
# Tokenization: split on any run of whitespace, so newlines/tabs kept by \s
# do not glue words together and no empty-string tokens are produced.
docs_words = docs.str.split()
words_all = [word for doc in docs_words for word in doc]
words_freq = pd.Series(words_all).value_counts()
words_freq.head(10)

In [None]:
import nltk

In [None]:
# Fetch the NLTK 'stopwords' resource; nltk.corpus.stopwords is read later in this notebook.
nltk.download('stopwords')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Stop words to drop from the frequency table: NLTK's English list plus a few
# corpus-specific tokens (including the empty string).
common_stopwords = nltk.corpus.stopwords.words('english')
custom_stopwords = ['amp', 'rt', '']
all_stopwords = common_stopwords + custom_stopwords

# Turn the frequency Series into a two-column frame. The columns are named
# positionally because the labels produced by value_counts().reset_index()
# differ between pandas versions ('index'/0 before 2.0, 'index'/'count' from
# 2.0 on); renaming by the old labels leaves no 'freq' column on current
# pandas and the plot below fails with a KeyError.
df_words_freq = words_freq.reset_index()
df_words_freq.columns = ['token', 'freq']
df_words_freq = df_words_freq[~df_words_freq['token'].isin(all_stopwords)].reset_index(drop=True)
df_words_freq.head(25).plot.barh(x='token', y='freq', figsize=(14,5))

In [None]:
from wordcloud import WordCloud
# Toy corpus: two short documents merged into one space-separated string,
# the input shape that WordCloud.generate() takes.
sample_docs = [
    'today is yogaday',
    'today i wish him a very happy birthday',
]
sample_docs_str = ' '.join(sample_docs)
# Word-cloud rendering of the toy corpus, kept disabled:
#wc = WordCloud(background_color='white', stopwords=all_stopwords).generate(sample_docs_str)
#plt.imshow(wc);
sample_docs_str

In [None]:
# Merge every cleaned document into one string and render it as a word cloud,
# ignoring the combined stop-word list.
docs_string = ' '.join(docs)
cloud_builder = WordCloud(background_color='white', stopwords=all_stopwords)
wc = cloud_builder.generate(docs_string)  # generate() returns the same WordCloud object
plt.figure(figsize=(14, 5))
plt.imshow(wc);

### Hashtag analysis

In [None]:
# Keep only the tokens that begin with '#', then chart the first 25 rows.
is_hashtag = df_words_freq['token'].str.startswith('#')
hashtags = df_words_freq[is_hashtag]
hashtags.head(25).plot.barh(x='token', y='freq', figsize=(14,4))

In [None]:
# Monthly count of tweets mentioning a given hashtag.
hashtag = '#swachhbharat'
tweets['docs'] = docs
# 1 when the cleaned text contains the hashtag, else 0.
# regex=False matches the hashtag literally (safe for any characters in it);
# na=False counts a missing document as "no match" instead of propagating NaN.
tweets['hashtag'] = tweets['docs'].str.contains(hashtag, regex=False, na=False).astype(int)
# 'YYYY_MM' labels sort chronologically, so the grouped line plot runs in time order.
tweets['year_month'] = tweets['created_at'].dt.strftime('%Y_%m')
tweets.groupby('year_month')['hashtag'].sum().plot.line();

### Optional Exercises
- Plot a word cloud for each month

### Document Term Matrix

In [None]:
# https://jmcauley.ucsd.edu/data/amazon/
# Draw a 10,000-row sample. random_state is fixed so the same rows are drawn
# on every run; without it every downstream result changes between runs.
reviews = pd.read_csv('/kaggle/input/trainings/amazon_reviews_big.csv').sample(10000, random_state=1)
# Label a review 'positive' when its `overall` value is >= 3, else 'negative'
# (a missing value compares False and so is labelled 'negative', as before).
reviews['sentiment'] = np.where(reviews['overall'] >= 3, 'positive', 'negative')
print(reviews.shape)
reviews.head()

### Text analytics libraries in Python
- NLTK
- Gensim
- spaCy
- TextBlob
- scikit-learn / Keras

In [None]:
#remove_stopwords('this movie is really good')
#from nltk.stem import PorterStemmer

# stemmer = PorterStemmer()
# stemmer.stem('looking'), stemmer.stem('looks'), stemmer.stem('looked')
# stemmer.stem('organization')
# stemmer.stem('president')

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer

# Clean, stop-word-filter and stem the review text.
stemmer = PorterStemmer()
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave digits and
# punctuation in place. The raw string avoids the invalid "\s" escape warning.
docs = reviews['reviewText'].fillna('NA').str.lower().str.replace(r'[^a-z\s]', '', regex=True)
docs = docs.apply(remove_stopwords)
# stem_documents() returns a plain list, which drops the index. Re-attach
# reviews.index so `docs` stays label-aligned with reviews['sentiment'].
docs = pd.Series(stemmer.stem_documents(docs), index=reviews.index)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Hold out 20% of the documents for testing (seeded split), then learn the
# vocabulary from the training documents only.
train_x, test_x, train_y, test_y = train_test_split(
    docs,
    reviews['sentiment'],
    test_size=0.2,
    random_state=1,
)

vectorizer = CountVectorizer()
vectorizer.fit(train_x)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() keeps `vocab` a plain list of strings.
vocab = vectorizer.get_feature_names_out().tolist()
vocab_size = len(vocab)
print('We have %d words across %d documents' % (vocab_size, train_x.shape[0]))

In [None]:
# Encode both splits as document-term matrices using the fitted vocabulary.
train_dtm, test_dtm = (vectorizer.transform(split) for split in (train_x, test_x))
train_dtm

In [None]:
# Expand the document-term matrices to dense arrays and wrap them in
# DataFrames: one row per document (original index kept), one column per term.
uncompressed_matrix = train_dtm.toarray()
df_train_dtm = pd.DataFrame(
    data=uncompressed_matrix,
    index=train_x.index,
    columns=vocab,
)
df_test_dtm = pd.DataFrame(
    data=test_dtm.toarray(),
    index=test_x.index,
    columns=vocab,
)

### Column-wise sum (term frequency)

In [None]:
# The 25 terms with the highest total count across the training documents
# (column sums of the document-term frame).
top_words = df_train_dtm.sum().sort_values(ascending=False).head(25)
top_words  # show the result; the assignment alone produces no cell output

### Row-wise sum (document size)

In [None]:
# Document length = number of counted tokens per document (row sums of the
# document-term frame); summarise the distribution of those lengths.
tokens_per_doc = df_train_dtm.sum(axis=1)
tokens_per_doc.sort_values(ascending=False).describe()

### Optional Exercise
- Identify sparsity of the matrix

## Word Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Each term is represented by its column of per-document counts.
v1 = df_train_dtm['camera']
v2 = df_train_dtm['tablet']
# Pairwise similarity matrix for the two term vectors; entry [0][1] is the
# similarity between the first and the second.
word_similarity = cosine_similarity([v1, v2])
word_similarity[0][1]

In [None]:
# Length of the term vector: one entry per row of df_train_dtm.
v1.shape

### Document Similarity

In [None]:
# Each document is represented by its row of per-term counts.
d1, d2 = df_train_dtm.iloc[0], df_train_dtm.iloc[1]
print(d1.shape)
doc_similarity = cosine_similarity([d1, d2])
doc_similarity[0][1]

In [None]:
# Display the current training document-term matrix for reference.
train_dtm

In [None]:
# min_df drops rare terms: a term is kept only if it occurs in at least
# `min_df` documents. With min_df=5, terms seen in fewer than five training
# documents are removed from the vocabulary.
vectorizer = CountVectorizer(min_df=5)
vectorizer.fit(train_x)
train_dtm = vectorizer.transform(train_x)
train_dtm

## N-grams
- Bigrams (two words per token/term)
- Trigrams (three words per token/term)

### Top 5 Bigrams

In [None]:
# Bigram document-term matrix (two-word terms seen in at least 5 documents),
# then the five bigrams with the highest total count.
vectorizer = CountVectorizer(min_df=5, ngram_range=(2,2)).fit(train_x)
train_dtm = vectorizer.transform(train_x)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
vocab = vectorizer.get_feature_names_out().tolist()
df_train_dtm = pd.DataFrame(train_dtm.toarray(), columns=vocab, index=train_x.index)
df_train_dtm.sum().sort_values(ascending=False).head(5)

### Top 5 Trigrams

In [None]:
# Trigram document-term matrix (three-word terms seen in at least 5 documents),
# then the five trigrams with the highest total count.
vectorizer = CountVectorizer(min_df=5, ngram_range=(3,3)).fit(train_x)
train_dtm = vectorizer.transform(train_x)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
vocab = vectorizer.get_feature_names_out().tolist()
df_train_dtm = pd.DataFrame(train_dtm.toarray(), columns=vocab, index=train_x.index)
df_train_dtm.sum().sort_values(ascending=False).head(5)

In [None]:
# Combined vocabulary of unigrams, bigrams and trigrams (each seen in at
# least 5 documents).
vectorizer = CountVectorizer(min_df=5, ngram_range=(1,3)).fit(train_x)
train_dtm = vectorizer.transform(train_x)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
vocab = vectorizer.get_feature_names_out().tolist()
df_train_dtm = pd.DataFrame(train_dtm.toarray(), columns=vocab, index=train_x.index)
#df_train_dtm.sum().sort_values(ascending=False).head(50)
#vocab

### Document Clustering
- Clustering algorithms: KMeans, DBSCAN
- Topic modeling: LSA, LDA

In [None]:
# Rebuild the unigram document-term frame (terms seen in at least 5
# documents) that the clustering step below works on.
vectorizer = CountVectorizer(min_df=5, ngram_range=(1,1)).fit(train_x)
train_dtm = vectorizer.transform(train_x)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
vocab = vectorizer.get_feature_names_out().tolist()
df_train_dtm = pd.DataFrame(train_dtm.toarray(), columns=vocab, index=train_x.index)
df_train_dtm.head()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardise every term column, then cluster the *scaled* matrix.
# Previously scaled_dtm was computed but the model was fit on the unscaled
# frame, so the scaling step had no effect.
scaled_dtm = StandardScaler().fit_transform(df_train_dtm)
# random_state pins the centroid initialisation so cluster ids are stable
# across runs; n_init is set explicitly because its default differs between
# scikit-learn versions.
model = KMeans(n_clusters=5, n_init=10, random_state=1).fit(scaled_dtm)

In [None]:
# Pair each training document with the cluster label the model assigned to it.
df = train_x.to_frame(name='review').assign(cluster=model.labels_)
#df['cluster'].value_counts()

In [None]:
# Word cloud of all reviews assigned to cluster 2.
in_cluster = df['cluster'] == 2
cluster_data = df[in_cluster]
docs_string = ' '.join(cluster_data['review'])
wc = WordCloud(background_color='white')
wc.generate(docs_string)  # generate() fills and returns the same object
plt.figure(figsize=(14, 5))
plt.imshow(wc)