In [1]:
import helper
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Konstantin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Making adjustments to the data frame, as there were errors when loading it.

In [2]:
### Split the df into 2 columns

# Reading with sep='\t' leaves every line of the file in one column named
# 'label;text'; that column is split manually below.
# NOTE(review): presumably done because messages can themselves contain ';' - confirm.
df = pd.read_csv('./data/SmsCollection.csv', sep='\t')

# Split on the FIRST ';' only: part 0 is the label, part 1 is the full message.
# This replaces str.replace('ham;|spam;', ''), which relied on the old implicit
# regex=True default (treated as a literal string on pandas >= 2.0, so nothing
# was removed) and, being unanchored, also deleted 'ham;'/'spam;' occurring
# inside a message body.
label_and_text = df['label;text'].str.split(';', n=1, expand=True)
df['Labels'] = label_and_text[0]
df['label;text'] = label_and_text[1]
df = df.rename(columns={'label;text': 'Texts'})

  df['label;text'] = df['label;text'].str.replace('ham;|spam;', '')


In [3]:
# Keep just the two columns of interest (label first), then summarise them
df = df.loc[:, ['Labels', 'Texts']]
df.describe()

Unnamed: 0,Labels,Texts
count,5574,5574
unique,2,5171
top,ham,"Sorry, I'll call later"
freq,4827,30


In [4]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Labels,Texts
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Class balance: number of ham messages vs. number of spam messages
print(df.loc[:, 'Labels'].value_counts())


ham     4827
spam     747
Name: Labels, dtype: int64


### Cleaning up the data

#### What snowball stemmer does: 
It reduces a word to its root form so that related words share a common stem. Example: 'care', 'cared' and 'caring' all reduce to the stem 'care'.

#### What stopwords do:
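The stopwords in NLTK are a pre-defined list of very common words (for example 'the', 'is', 'and'). They say little about the topic of a text, so they are usually removed before analysis. The list ships with NLTK, but `stopwords.words('english')` returns an ordinary Python list, so entries can be added or removed after loading it if needed.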

### Tokenisation:
In text mining, tokenisation is the process of breaking text into smaller parts (tokens), such as individual words. Below is my attempt at this process.

In [6]:
# NLTK Snowball stemmer and stop-word set, both for English
st = SnowballStemmer('english')
stops = set(stopwords.words('english'))

# Every character that is not an ASCII letter is treated as a word separator
NON_LETTERS = re.compile('[^a-zA-Z]')


def clean_text(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps: lower-case the text, replace every non-letter character with a
    space, split on whitespace, drop English stop words, stem what is left
    and join the stems back together with single spaces.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        Space-separated stems; an empty string if no word survives.
    """
    # .split() with no argument already discards leading/trailing whitespace
    # and runs of spaces, so no separate strip / space-collapsing pass is needed.
    words = NON_LETTERS.sub(' ', text.lower()).split()
    return ' '.join(st.stem(word) for word in words if word not in stops)


# One pass over the column instead of four separate .apply() calls
df['Clean texts'] = df['Texts'].apply(clean_text)

# 'Clean texts' is deliberately kept as plain strings: splitting each entry
# into a list of words was found to cause an error in the vectorisation step.
df

Unnamed: 0,Labels,Texts,Clean texts
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...,nd time tri contact u u pound prize claim easi...
5570,ham,Will ü b going to esplanade fr home?,b go esplanad fr home
5571,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5572,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...


### Vectorisation:
The conventional workflow for text data is to clean the text first and then proceed to vectorisation, which means converting the text into numerical data that a model can use.
Here is the link: 
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [13]:
# Learn the vocabulary and build the document-term count matrix in one call
# (fit_transform gives the same result as fit() followed by transform() on the same data)
vectoriser = CountVectorizer()
vector = vectoriser.fit_transform(df['Clean texts'])

# fit() returns the estimator itself, so `X` is simply another name for the
# fitted vectoriser; X.vocabulary_ maps each term to its column index in
# `vector` (it does not hold how often each word occurs).
X = vectoriser

print(vector.shape)
# Dense view of the counts: one row per message, one column per vocabulary term
print(vector.toarray())

(5574, 6292)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Models and tests