In [41]:
import re
import warnings
from collections import Counter

import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from preprocessing import preprocess_text

## Initial load and inspection

In [42]:
#read the SMS dataset (provided by Codecademy) into a dataframe
sms_csv_path = './data/clean_nus_sms.csv'
df = pd.read_csv(sms_csv_path)

In [43]:
#preview the first five rows of the freshly loaded data
df.head()

Unnamed: 0.1,Unnamed: 0,id,Message,length,country,Date
0,0,10120,Bugis oso near wat...,21,SG,2003/4
1,1,10121,"Go until jurong point, crazy.. Available only ...",111,SG,2003/4
2,2,10122,I dunno until when... Lets go learn pilates...,46,SG,2003/4
3,3,10123,Den only weekdays got special price... Haiz......,140,SG,2003/4
4,4,10124,Meet after lunch la...,22,SG,2003/4


In [44]:
#drop the index column
#reassigning (rather than inplace=True) together with errors='ignore' keeps this
#cell safe to re-run: a second run no longer raises KeyError for the missing column
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [45]:
#(rows, columns) of the frame after dropping the extra index column
df.shape

(48598, 5)

In [46]:
#check the dtype pandas inferred for each column
df.dtypes

id          int64
Message    object
length     object
country    object
Date       object
dtype: object

In [47]:
#make capitalized column names lowercase
#rename() skips labels that are no longer present, so re-running this cell is
#harmless (copying the columns and then dropping the originals raised KeyError
#on a second run)
df = df.rename(columns={'Message': 'message', 'Date': 'date'})

#keep the renamed columns at the end of the frame, as before
trailing_columns = ['message', 'date']
leading_columns = [col for col in df.columns if col not in trailing_columns]
df = df[leading_columns + trailing_columns]

In [48]:
#confirm the lowercase 'message' and 'date' columns replaced the capitalized ones
df.head()

Unnamed: 0,id,length,country,message,date
0,10120,21,SG,Bugis oso near wat...,2003/4
1,10121,111,SG,"Go until jurong point, crazy.. Available only ...",2003/4
2,10122,46,SG,I dunno until when... Lets go learn pilates...,2003/4
3,10123,140,SG,Den only weekdays got special price... Haiz......,2003/4
4,10124,22,SG,Meet after lunch la...,2003/4


## Vectorizing messages

In [49]:
#make certain that all messages are string, for purposes of vectorizing
#missing values are filled with '' first: astype(str) alone would turn NaN into
#the literal text 'nan', which would then be vectorized as if it were a word
df['message'] = df['message'].fillna('').astype(str)

In [50]:
#run every message through preprocess_text (imported from preprocessing.py)
#and store the cleaned text in its own column
raw_messages = df['message']
df['preprocessed_message'] = raw_messages.apply(preprocess_text)

In [51]:
#create vectorizer and fit it
#norm=None turns off the vectorizer's row normalization
vectorizer = TfidfVectorizer(norm = None)
tfidf_scores = vectorizer.fit_transform(df['preprocessed_message'])

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use get_feature_names_out() when it exists and fall back on older releases
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

#one label per message, numbered from 1
message_index = [f"Message {i+1}" for i in range(len(df))]

In [54]:
#create a new dataframe of vectorized messages (rows = terms, columns = messages)
#no try/except here: swallowing a failure would leave df_tf_idf undefined (or
#stale from an earlier run) and make the following cells fail or mislead
#toarray() yields a plain ndarray, whereas todense() returns np.matrix
#NOTE: this materializes the full dense terms-by-messages matrix in memory
df_tf_idf = pd.DataFrame(tfidf_scores.T.toarray(), index=feature_names, columns=message_index)
df_tf_idf

        Message 1  Message 2  Message 3  Message 4  Message 5  Message 6  \
00            0.0        0.0        0.0        0.0        0.0        0.0   
000           0.0        0.0        0.0        0.0        0.0        0.0   
000pes        0.0        0.0        0.0        0.0        0.0        0.0   
001           0.0        0.0        0.0        0.0        0.0        0.0   
00am          0.0        0.0        0.0        0.0        0.0        0.0   
...           ...        ...        ...        ...        ...        ...   
鞛堨姷雼堧嫟        0.0        0.0        0.0        0.0        0.0        0.0   
頃滉淡鞏措ゼ        0.0        0.0        0.0        0.0        0.0        0.0   
饮水思源          0.0        0.0        0.0        0.0        0.0        0.0   
骨奶            0.0        0.0        0.0        0.0        0.0        0.0   
鸿兔大展          0.0        0.0        0.0        0.0        0.0        0.0   

        Message 7  Message 8  Message 9  Message 10  ...  Message 48589  \
00          

In [55]:
#flip the frame so that each row is a message and each column is a term
#NOTE: running this cell a second time flips the frame back
df_tf_idf = df_tf_idf.transpose()

In [56]:
#create column of countries for each message
#if the token "country" is part of the fitted vocabulary it is already a TF-IDF
#column, and assign() below silently replaces its scores with the labels;
#warn so the lost feature does not go unnoticed
if 'country' in vectorizer.vocabulary_:
    warnings.warn(
        "The token 'country' is in the TF-IDF vocabulary; its feature column "
        "is overwritten by the country labels."
    )

#tolist() discards df's integer index so the labels are attached by position
#rather than aligned against the "Message N" index
df_tf_idf = df_tf_idf.assign(country = df['country'].tolist())

## Quick confirmation of accuracy

In [57]:
#show the country label attached to each message row of the new dataframe
df_tf_idf['country']

Message 1            SG
Message 2            SG
Message 3            SG
Message 4            SG
Message 5            SG
                  ...  
Message 48594    Serbia
Message 48595    Serbia
Message 48596    Serbia
Message 48597    Serbia
Message 48598    Serbia
Name: country, Length: 48598, dtype: object

In [58]:
#the country column here should match the first five rows of df_tf_idf['country'] shown above
df.head()

Unnamed: 0,id,length,country,message,date,preprocessed_message
0,10120,21,SG,Bugis oso near wat...,2003/4,bugis oso near wat
1,10121,111,SG,"Go until jurong point, crazy.. Available only ...",2003/4,go until jurong point crazy available only in ...
2,10122,46,SG,I dunno until when... Lets go learn pilates...,2003/4,i dunno until when let go learn pilate
3,10123,140,SG,Den only weekdays got special price... Haiz......,2003/4,den only weekday get special price haiz cant e...
4,10124,22,SG,Meet after lunch la...,2003/4,meet after lunch la


In [59]:
#the country column here should match the last five rows of df_tf_idf['country'] shown earlier
df.tail()

Unnamed: 0,id,length,country,message,date,preprocessed_message
48593,45712,21,Serbia,Come to me AFTER NOON,2015/3,come to me after noon
48594,45714,14,Serbia,I LOVE YOU TOO,2015/3,i love you too
48595,45715,4,Serbia,C-YA,2015/3,c ya
48596,45717,11,Serbia,BE MY GUEST,2015/3,be my guest
48597,45718,21,Serbia,MANY MANY MANY PEOPLE,2015/3,many many many people


In [61]:
#write the vectorized frame to disk for the modeling step
#compression is inferred from the '.zip' extension; the "Message N" index is not written
tfidf_output_path = './data/df_tf_idf.zip'
df_tf_idf.to_csv(tfidf_output_path, index=False, compression='infer')