# Text Classification



### Import required modules

In [26]:
import numpy as np
import pandas as pd
import os
import re

### Data pre-processing

In [27]:
data=pd.read_csv("SMSSpamCollection.csv",delimiter="\t") #Reading data
data.columns=['Type','Text'] #Changing Column names 
type(data) #Type is a data frame
data.head(5) #Displays first five records

Unnamed: 0,Type,Text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [28]:
Text_data = data['Text']
type(data)


pandas.core.frame.DataFrame

In [29]:
Text_data1 = Text_data.tolist()
#Some text preprocessing- Removing extra spaces and numbers
for i in range(len(Text_data1)):
    Text_data1[i]=re.sub(r'\s+', ' ', Text_data1[i]) #Removing more than one white spaces in the sentence
    Text_data1[i]=re.sub('[\d]', '',Text_data1[i]) #Removing numbers


In [30]:
Text_data[8]

'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'

In [31]:
#Text_data1 = Text_data.tolist()
Text_data1[8]

'Had your mobile  months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on '

### Split the data into train and test

In [14]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test=train_test_split(Text_data1,data['Type'],test_size=0.3,random_state=1234)

In [15]:
#Ensure similar distribution of target classes in both train and test
y_train.value_counts()

ham     3378
spam     521
Name: Type, dtype: int64

In [16]:
y_test.value_counts()

ham     1446
spam     226
Name: Type, dtype: int64

### Construct tf-idf matrix

In [17]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#count=CountVectorizer(strip_accents='unicode',ngram_range=(1,1),stop_words='english')
#tfreq=count.fit_transform(Text_data1)
#Dense_mat=tfreq.todense()
#A=pd.DataFrame(Dense_mat,columns=count.get_feature_names())

In [24]:
#Lets construct TF-idf matrix based on the train documents
tfidf_transformer = TfidfVectorizer(ngram_range=(1,2),stop_words='english')
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_train_tfidf.shape

(3899, 26575)

In [None]:
#??tfidf_transformer.fit #"""Learn vocabulary and idf from training set.
#??tfidf_transformer.transform # """Transform documents to document-term matrix.
#Uses the vocabulary and document frequencies (df) learned by fit (or fit_transform)

In [None]:
# Get the tfidf matrix for test documents
X_test_tfidf = tfidf_transformer.transform(X_test)
X_test_tfidf.shape

### Implementing Naive Bayes Classification model

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB().fit(X_train_tfidf, y_train) #nb_clf is the classifier
pred_train = nb_clf.predict(X_train_tfidf) #get predictions on train data

from sklearn.metrics import confusion_matrix
prediction=confusion_matrix(y_train, pred_train) #get confusion matrix for train predictions


##### Metrics on train data

In [None]:
#Metrics
print("Train_Confusion Matrix:\n",prediction,"\n")

from sklearn.metrics import recall_score, precision_score, accuracy_score

acc = accuracy_score(y_train, pred_train)
rec = recall_score(y_train, pred_train, pos_label='spam')
prec = precision_score(y_train, pred_train, pos_label='spam')

print("Results of Naive Bayes on train data:","\nAccuracy:",acc, "\nRecall:",rec, "\nPrecision:",prec)

##### Predict on test data

In [None]:
pred = nb_clf.predict(X_test_tfidf) #predict on test data
prediction = confusion_matrix(y_test,pred) ##get confusion matrix for test predictions

In [None]:
print("Test_Confusion matrix:\n", prediction,"\n")
acc = accuracy_score(y_test,pred)
rec = recall_score(y_test,pred,pos_label='spam')
prec=precision_score(y_test,pred,pos_label='spam')
print("Results of Naive Bayes on test data:","\nAcc: ",acc,"\nRec:", rec, "\nPrec:", prec)

### Implementing Logistic Regression

In [None]:
from sklearn import linear_model

logreg_train = linear_model.LogisticRegression()
lr_clf = logreg_train.fit(X_train_tfidf, y_train) #fit logreg model on train data

pred_lr_train=lr_clf.predict(X_train_tfidf) # predict on train data
pred_lr_test=lr_clf.predict(X_test_tfidf) # predict on test data


print("Train_Confusion matrix:\n", confusion_matrix(y_train,pred_lr_train))
print("Test_Confusion matrix:\n",confusion_matrix(y_test,pred_lr_test))

In [None]:
print("Results of Logistic regression on test data")
acc=accuracy_score(y_test,pred_lr_test)
rec=recall_score(y_test,pred_lr_test,pos_label='spam')
prec=precision_score(y_test,pred,pos_label='spam')
print("\nAcc: ",acc,"\nRec:", rec, "\nPrec:", prec)