In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the tweets CSV into a DataFrame and display its (rows, columns) shape.
tweets = pd.read_csv('/kaggle/input/trainings/narendramodi_tweets.csv')
tweets.shape

In [None]:
# Preview the first rows of the tweets frame.
tweets.head()

In [None]:
# Keep only the 'text' column as the document collection and preview it.
docs = tweets['text']
docs.head()

## Extract tokens

In [None]:
# Normalise every tweet: lowercase it, then strip each character that is not a
# lowercase letter, whitespace, '#' or '@' (so hashtags and mentions survive).
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would silently remove nothing.
docs = tweets['text'].str.lower().str.replace(r'[^a-z\s#@]', '', regex=True)

# Splitting on a single space produces '' tokens wherever spaces are adjacent.
docs_tokens = docs.str.split(' ')

# Flatten the per-tweet token lists into one corpus-wide list.
tokens_all = [token for tokens in docs_tokens for token in tokens]
print('No. of tokens in entire corpus:', len(tokens_all))


## Bag of words analysis

In [None]:
# Count how often each token occurs (most frequent first) and discard the
# empty-string token. errors='ignore' keeps the cell from raising KeyError
# when the corpus happens to contain no empty token.
tokens_freq = pd.Series(tokens_all).value_counts().drop([''], errors='ignore')
tokens_freq

In [None]:
# Fetch NLTK's stopword corpus; `nltk.corpus.stopwords` is used in the next cell.
import nltk # natural language tool kit
nltk.download('stopwords')

In [None]:
# NLTK's English stopwords plus corpus-specific noise tokens
# (presumably 'amp' is the remnant of '&amp;' and 'rt' marks retweets).
common_stopwords = nltk.corpus.stopwords.words('english')
custom_stopwords = ['amp', 'rt']

# Both operands are plain lists of str, so simple concatenation is enough;
# a NumPy array of fixed-width unicode strings adds nothing here.
all_stopwords = common_stopwords + custom_stopwords
len(all_stopwords)

In [None]:
import matplotlib.pyplot as plt

# Name both columns explicitly instead of renaming the auto-generated ones:
# since pandas 2.0 `value_counts()` calls its result 'count' (not 0), so the
# previous rename left the frame without a 'frequency' column.
df_tokens = tokens_freq.rename_axis('token').reset_index(name='frequency')

# Drop stopwords so that only content-bearing tokens remain.
df_tokens = df_tokens[~df_tokens['token'].isin(all_stopwords)]

# Bar chart of the 25 most frequent remaining tokens.
fig, ax = plt.subplots(figsize=(14, 5))
df_tokens.set_index('token')['frequency'].head(25).plot.bar(ax=ax)
ax.set(title='Top 25 tokens (stopwords removed)', xlabel='token', ylabel='frequency');

## Wordclouds

In [None]:
from wordcloud import WordCloud

# Two toy documents, merged into one space-separated string, to show what
# WordCloud does with repeated words.
doc1 = 'i love india'
doc2 = 'i love cricket'
docs_string = f'{doc1} {doc2}'
print(docs_string)

wc = WordCloud(background_color='white').generate(docs_string)
plt.imshow(wc)

In [None]:
# Merge every cleaned tweet into one string and render a single word cloud
# for the whole corpus, ignoring the combined stopword list.
docs_strings = ' '.join(docs)
wc = WordCloud(background_color='white', stopwords=all_stopwords).generate(docs_strings)

fig, ax = plt.subplots(figsize=(20, 5))
ax.imshow(wc)
ax.axis('off');

## Hashtag analysis

In [None]:
# Keep only tokens that start with '#' and chart the 25 most frequent ones.
is_hashtag = df_tokens['token'].str.startswith('#')
hashtags = df_tokens[is_hashtag]

fig, ax = plt.subplots(figsize=(20, 5))
hashtags.set_index('token')['frequency'].head(25).plot.bar(ax=ax)

In [None]:
# Parse the timestamp column and derive a 'YYYY-MM' bucket for monthly grouping.
created = pd.to_datetime(tweets['created_at'], format="%Y-%m-%d %H:%M:%S")
tweets = tweets.assign(
    created_at=created,
    year_month=created.dt.strftime('%Y-%m'),
)
tweets['created_at'].describe()

In [None]:
# Keyword whose monthly usage is tracked (matched anywhere in the tweet text,
# not only as a '#'-prefixed tag).
hashtag = 'digital'

# Case-insensitive literal substring match. regex=False stops the keyword from
# being interpreted as a regular expression; na=False makes tweets with missing
# text count as "no match" instead of propagating NaN into the sum.
tweets[hashtag] = tweets['text'].str.lower().str.contains(hashtag, regex=False, na=False)

# Number of matching tweets per month.
hashtag_month_count = tweets.groupby(['year_month'])[hashtag].sum()
ax = hashtag_month_count.plot.line()
ax.set(title=f"Tweets mentioning '{hashtag}' per month", xlabel='year-month', ylabel='tweets');

In [None]:
# One word cloud per month, in order of first appearance in the data.
# Loop-local names are used so the notebook-level `docs` / `docs_strings`
# from the earlier cells are not overwritten.
for year_month, month_tweets in tweets.groupby('year_month', sort=False):
    # regex=True is required: pandas >= 2.0 otherwise treats the pattern literally.
    month_docs = month_tweets['text'].str.lower().str.replace(r'[^a-z\s#@]', '', regex=True)
    month_text = ' '.join(month_docs)

    # Nothing left to draw after cleaning; WordCloud cannot render an empty text.
    if not month_text.strip():
        continue

    wc = WordCloud(background_color='white', stopwords=all_stopwords).generate(month_text)
    fig, ax = plt.subplots(figsize=(20, 5))
    ax.imshow(wc)
    ax.set_title(year_month)
    ax.axis('off')

## Vectorization

In [None]:
# Work on a 1,000-review subset of the IMDB data. The sample is seeded so that
# "Restart & Run All" always selects the same rows (and therefore produces the
# same vocabulary and matrices downstream).
imdb = pd.read_csv('/kaggle/input/trainings/IMDB Dataset.csv').sample(1000, random_state=1)
print(imdb.shape)
imdb.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Lowercase the reviews and keep only letters and whitespace.
# regex=True is required: pandas >= 2.0 otherwise treats the pattern literally.
docs = imdb['review'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

train_docs, test_docs = train_test_split(docs, test_size=0.2, random_state=1)

# English stopwords, but keep 'not' as a feature
# (presumably because negation matters for the reviews' sentiment).
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('not')

# Learn the vocabulary from the training split only; terms that occur in
# fewer than 10 training documents are ignored.
vectorizer = CountVectorizer(stop_words=stopwords, min_df=10).fit(train_docs)
# get_feature_names() was removed in scikit-learn 1.2; this is its replacement
# (available since scikit-learn 1.0).
vocab = vectorizer.get_feature_names_out()

In [None]:
# Encode both splits with the already-fitted vectorizer.
train_dtm, test_dtm = (
    vectorizer.transform(split) for split in (train_docs, test_docs)
)

In [None]:
# Dense, labelled views of the document-term matrices:
# one row per document (original index), one column per vocabulary term.
df_train_dtm = pd.DataFrame(
    data=train_dtm.toarray(),
    index=train_docs.index,
    columns=vocab,
)
df_test_dtm = pd.DataFrame(
    data=test_dtm.toarray(),
    index=test_docs.index,
    columns=vocab,
)

## Stemming

In [None]:
from nltk.stem import PorterStemmer

# Show the stem of three inflections of the same verb, one per line.
stemmer = PorterStemmer()
print(*(stemmer.stem(word) for word in ('looking', 'looks', 'looked')), sep='\n')

In [None]:
# Stem a single word with the current `stemmer` object.
stemmer.stem('president')

In [None]:
# NOTE: this import rebinds the name `PorterStemmer` (previously imported from
# nltk.stem) to gensim's class, and `stemmer` is rebound to a gensim instance.
# The later call to `stemmer.stem_documents(...)` relies on this cell having run.
from gensim.parsing.preprocessing import PorterStemmer, remove_stopwords
stemmer = PorterStemmer()
# Stem three example documents in one call.
stemmer.stem_documents(['This movie is good', 'this movie is really pathetic', 'awesome movie'])

In [None]:
# Demo: apply gensim's remove_stopwords to one example sentence.
remove_stopwords('this movie is really pathetic')

In [None]:
# Clean the reviews (lowercase, letters and whitespace only).
# regex=True is required: pandas >= 2.0 otherwise treats the pattern literally.
cleaned = imdb['review'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)
cleaned = cleaned.apply(remove_stopwords)

# `stemmer` must be the gensim PorterStemmer here (it provides stem_documents).
# stem_documents returns a plain list, so wrap it back into a Series carrying
# the original row labels; otherwise the link between each document and its
# row in `imdb` is lost after the train/test split.
docs = pd.Series(stemmer.stem_documents(cleaned), index=cleaned.index)

In [None]:
# Split the stemmed documents, then learn the vocabulary from the training
# split only; terms seen in fewer than 5 training documents are ignored.
train_docs, test_docs = train_test_split(pd.Series(docs), test_size=0.2, random_state=1)

vectorizer = CountVectorizer(min_df=5).fit(train_docs)
# get_feature_names() was removed in scikit-learn 1.2; this is its replacement
# (available since scikit-learn 1.0).
vocab = vectorizer.get_feature_names_out()

train_dtm = vectorizer.transform(train_docs)
test_dtm = vectorizer.transform(test_docs)

In [None]:
# Dense, labelled views of the document-term matrices built from the current
# `train_dtm` / `test_dtm`: one row per document, one column per vocabulary term.
df_train_dtm = pd.DataFrame(
    data=train_dtm.toarray(),
    index=train_docs.index,
    columns=vocab,
)
df_test_dtm = pd.DataFrame(
    data=test_dtm.toarray(),
    index=test_docs.index,
    columns=vocab,
)

In [None]:
# Preview the first rows of the training document-term frame.
df_train_dtm.head()

In [None]:
# Same pipeline as the count-based matrices, but with TF-IDF weights;
# the vocabulary is again learned from the training split only.
vectorizer = TfidfVectorizer(min_df=5).fit(train_docs)
# get_feature_names() was removed in scikit-learn 1.2; this is its replacement
# (available since scikit-learn 1.0).
vocab = vectorizer.get_feature_names_out()

train_dtm_tfidf = vectorizer.transform(train_docs)
test_dtm_tfidf = vectorizer.transform(test_docs)

df_train_dtm_tfidf = pd.DataFrame(train_dtm_tfidf.toarray(), index=train_docs.index, columns=vocab)
df_train_dtm_tfidf.head()

In [None]:
# Vocabulary of unigrams, bigrams and trigrams that occur in at least
# 5 training documents.
vectorizer = CountVectorizer(min_df=5, ngram_range=(1,3)).fit(train_docs)
# get_feature_names() was removed in scikit-learn 1.2; this is its replacement
# (available since scikit-learn 1.0).
vocab = vectorizer.get_feature_names_out()
vocab[:5]

In [None]:
import spacy

# Run the small English spaCy pipeline over the first sampled review and
# print each token next to its lemma and part-of-speech tag.
nlp = spacy.load("en_core_web_sm")
doc = imdb['review'].iloc[0]
proc_doc = nlp(doc)

for token in proc_doc:
    print(token, token.lemma_, token.pos_, sep=' | ')


In [None]:
!pip install wordcloud