# Drive Mount

In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [0]:
import os
os.chdir("gdrive/Shared drives/Large Scale Analytics")


In [0]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, ImageColorGenerator
from nltk import FreqDist
#split Data
from sklearn.model_selection import train_test_split
#Spacy for "# Tokenization for the Clened Data"
import spacy
from tqdm import tqdm
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])
#CSR Matrix
from sklearn.feature_extraction.text import TfidfVectorizer
#pickel dump
import pickle


# Reading Dataset

In [4]:
df_happy = pd.read_csv("happydb/cleaned_hm.csv")
df_happy_moment=df_happy[df_happy["cleaned_hm"].notnull()]
df_temp=df_happy_moment[["cleaned_hm", "predicted_category"]]
df_temp = df_temp.dropna()
df_temp.head(2)


Unnamed: 0,cleaned_hm,predicted_category
0,I went on a successful date with someone I fel...,affection
1,I was happy when my son got 90% marks in his e...,affection


In [5]:
print(df_temp.count())
print(df_temp.shape)

cleaned_hm            100535
predicted_category    100535
dtype: int64
(100535, 2)


# Text Pre-Processing For cleaned_hm 

In [0]:
#creating x= input variable and y = output prediction variable
df_cleaned_hm = df_temp[["cleaned_hm"]]
df_predicted_category = df_temp["predicted_category"]

In [7]:
#Sepreted Data Frames
print("df_cleaned_hm")
print(df_cleaned_hm.head(2))
print("Shape ",df_cleaned_hm.shape)
print("df_predicted_category")
print(df_predicted_category.head(2))
print("shape ",df_predicted_category.shape)

df_cleaned_hm
                                          cleaned_hm
0  I went on a successful date with someone I fel...
1  I was happy when my son got 90% marks in his e...
Shape  (100535, 1)
df_predicted_category
0    affection
1    affection
Name: predicted_category, dtype: object
shape  (100535,)


In [8]:
# split for train and test data
XTrain, XTest, yTrain, yTest = train_test_split(df_cleaned_hm, df_predicted_category, test_size = 0.2, random_state = 42)
print("XTrain count: ", XTrain.count(), " ", "yTrain count:" ,yTrain.count()  )
print("XTest count: ", XTest.count(), " ", "yTest count:" ,yTest.count()  )
# XTrain.head()


XTrain count:  cleaned_hm    80428
dtype: int64   yTrain count: 80428
XTest count:  cleaned_hm    20107
dtype: int64   yTest count: 20107


In [9]:
# All different Type of "Prediction Categories"  = 7 in this DataSet
df_predicted_category.value_counts()

affection           34168
achievement         33993
enjoy_the_moment    11144
bonding             10727
leisure              7458
nature               1843
exercise             1202
Name: predicted_category, dtype: int64

In [0]:
# cleaning data using regex
clean_1 = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
clean_2 = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

In [0]:
# function to clean text data
def clean_desc(desc):
    desc = [clean_1.sub("", line.lower()) for line in desc]
    desc = [clean_2.sub(" ", line) for line in desc]
    return desc

In [12]:
# cleaning regex and lowering the case
XTrain['cleaned_hm'] = clean_desc(XTrain['cleaned_hm']) 
XTest['cleaned_hm'] = clean_desc(XTest['cleaned_hm'])
print(XTrain.head(2))
print(XTest.head(2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


                                              cleaned_hm
68453  i had a nice steak dinner with my wife and fri...
39045  an event that made me happy was when my childr...
                                              cleaned_hm
10484  i enjoyed spending the day at a craft show wit...
40401  i had a good evaluation at work and received a...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Tokenization for the Clened Data

In [0]:
# tokenization using spaCy

# Function 

def tokenization(x):
    desc_tokens = []
    for i in tqdm(x):
        i = nlp(i)
        temp = []
        for j in i:
            temp.append(j.text)
        desc_tokens.append(temp)
    
    return desc_tokens

In [14]:
#Tokenization for Train data
XTrain['cleaned_hm'] = tokenization(XTrain['cleaned_hm'])
# XTrain.head()
XTest['cleaned_hm'] = tokenization(XTest['cleaned_hm'])
# XTest.head()

100%|██████████| 80428/80428 [03:08<00:00, 426.89it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
100%|██████████| 20107/20107 [00:46<00:00, 433.75it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
#Removing Stop Words
# function to remove stopwords
def remove_stopwords(desc):
    s = []
    for r in tqdm(desc):
        s_2 = []
        for token in r:
            if nlp.vocab[token].is_stop == True:
                continue
            else:
                s_2.append(token)
        s.append(" ".join(s_2))    
        
    return s

In [16]:
XTrain['cleaned_hm'] = remove_stopwords(XTrain['cleaned_hm'])
XTest['cleaned_hm'] = remove_stopwords(XTest['cleaned_hm'])

100%|██████████| 80428/80428 [00:01<00:00, 74592.74it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
100%|██████████| 20107/20107 [00:00<00:00, 78864.36it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [0]:
# function to lemmatize text
def lemmatization(texts):
    output = []
    for i in texts:
             s = [token.lemma_ for token in nlp(i)]
             output.append(' '.join(s))
    return output

In [18]:
XTrain['cleaned_hm'] = lemmatization(XTrain['cleaned_hm'])
XTest['cleaned_hm'] = lemmatization(XTest['cleaned_hm'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Pickel Dump For Train and Test DataFrame

In [0]:
pickle.dump( XTrain, open( "XTrain_Cleaned_hm_11_15.pkl", "wb" ) )
pickle.dump( XTest, open( "XTest_Cleaned_hm_11_15.pkl", "wb" ) )
pickle.dump( yTrain, open( "yTrain_prediction_11_15.pkl", "wb" ) )
pickle.dump( yTest, open( "yTest_prediction_11_15.pkl", "wb" ) )
pickle.dump( df_happy_moment, open( "df_happy_moment_11_15.pkl", "wb" ) )

# CSR Matrix Creation

In [0]:
# # build TF-IDF features for train data
cv = TfidfVectorizer(use_idf=True, min_df=3, max_df=0.5, ngram_range=(1,2),sublinear_tf=True,max_features=5000)
cv_train = cv.fit_transform(XTrain['cleaned_hm'])
# print(type(XTest))
# print(XTest)
cv = TfidfVectorizer(use_idf=True, min_df=3, max_df=0.5, ngram_range=(1,2),
                        sublinear_tf=True,max_features=5000)
cv_test = cv.fit_transform(XTest['cleaned_hm'])

In [27]:
print(type(cv_train))
print(cv_train.shape)
print(cv_train[2957])
# cv_train[0,1649]
cv_train[0,4185]
cv_test[20103,2059]

<class 'scipy.sparse.csr.csr_matrix'>
(80428, 5000)
  (0, 2718)	0.7329842634211037
  (0, 2886)	0.5232228270386609
  (0, 1063)	0.43470903239143044


0.5579723084930774

# Pickel Dump For CSR Matrix

In [0]:
pickle.dump( cv_train, open( "cv_train_CSR_Matrix_11_15.pkl", "wb" ) )
pickle.dump( cv_test, open( "cv_test_CSR_Matrix_11_15.pkl", "wb" ) )

# Pickel Dataset read

In [0]:
df_happy_moment = pd.read_pickle("./df_happy_moment_11_15.pkl")
XTrain = pd.read_pickle("./XTrain_Cleaned_hm_11_15.pkl")
XTest = pd.read_pickle("./XTest_Cleaned_hm_11_15.pkl")
yTrain = pd.read_pickle("./yTrain_prediction_11_15.pkl")
yTest = pd.read_pickle("./yTest_prediction_11_15.pkl")
cv_train = pd.read_pickle("./cv_train_CSR_Matrix_11_15.pkl")
cv_test = pd.read_pickle("./cv_test_CSR_Matrix_11_15.pkl")

In [35]:
# print(XTrain.head())
# print(XTest.head(1))
# print(yTrain.head(1))
# print(yTest.head(1))
# cv_train[0,1649]
# print(cv_train[0,4185])
# print(cv_train)
# print(cv_test[20103,2059])
print("XTrain shape ", XTrain.shape)
print("yTrain shape ", yTrain.shape)
print("cv_train shape ", cv_train.shape)


XTrain shape  (80428, 1)
yTrain shape  (80428,)
cv_train shape  (80428, 5000)


# temp