In [1]:
!cp drive/My\ Drive/natural-language-processing/q3_train.zip .

In [2]:
!unzip q3_train.zip

Archive:  q3_train.zip
   creating: train/
  inflating: train/test.csv          
  inflating: train/train.csv         


#### Importing libraries

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
import re
from nltk.corpus import stopwords
import string
from keras.utils import to_categorical
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
tqdm.pandas()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


#### Loading dataset

In [4]:
df = pd.read_csv('train/train.csv')
df_test = pd.read_csv('train/test.csv')

In [6]:
df.head()

Unnamed: 0,Text,Class
0,I do.,0
1,What if anything was decided about whether I s...,0
2,H: If Roubini is right and he's been mostly ri...,0
3,HRC: Below is an oped on the National Security...,0
4,DQoNCg0KDQoNCg0KDQpHb29kIERheSwNCk1heSBpdCBub3...,1


In [7]:
df_test.head()

Unnamed: 0,Text
0,"FROM BONGO WALEJOHANNESBURG,SOUTH AFRICA.TELL:..."
1,MR=2E DONALD COLLINSCREDIT MUTUEL DU SENEGAL B...
2,Meant to write that Solomon is a tough critic.
3,Well well
4,#226=2C TAYO RIMI=2CMEDINA=2CDAKAR=2C SENEGAL=...


#### Checking Nan values

In [10]:
df.isna().sum()

Text     1
Class    0
dtype: int64

#### Data Pre-processing
- Nan values removal
- Normalization
- Punctuation removal
- Stop words removal
- Lemmatization
- Removing words having length of less than two
- Tokenization


In [11]:
df.dropna(inplace=True)

In [12]:
df['Class'] = df['Class'].astype(int)

In [13]:
def clean_data(text):
   lower = text.lower()
   splitted = lower.split()
   re_punc = re.compile('[%s]' % re.escape(string.punctuation))
   tokens = [re_punc.sub('',w) for w in splitted]
   tokens = [word for word in tokens if word.isalpha()]
   stop_words = set(stopwords.words('english'))
   tokens = [w for w in tokens if not w in stop_words]
   lemmeted = [WordNetLemmatizer().lemmatize(w) for w in tokens]
   tokens = [word for word in lemmeted if len(word) > 2]
   return tokens

#### Train/Test split

In [14]:
x_train, x_val, y_train, y_val = train_test_split(df['Text'], df['Class'])
vocab = Counter()
for index, row in x_train.iteritems():
  vocab.update(clean_data(row))

#### Most common words in the vocabulary

In [15]:
vocab.most_common(10)

[('money', 6747),
 ('account', 5421),
 ('bank', 5282),
 ('fund', 4782),
 ('business', 3069),
 ('country', 2993),
 ('next', 2730),
 ('transaction', 2635),
 ('transfer', 2496),
 ('want', 2494)]

In [16]:
vocab_size = len(vocab)
print("Vocabulary Size is: ", vocab_size)

Vocabulary Size is:  67860


#### Feature extraction 
- TF-IDF Vectorizer

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(vocabulary=vocab.keys())
x_train = v.fit_transform(x_train)
x_val = v.fit_transform(x_val)


#### Model preparation
- SVM with Linear kernel
- Naive Bayes

In [18]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# probability=True -> predict_proba
model = SVC(kernel='linear', probability=True)
model.fit(x_train, y_train)
pred = model.predict(x_val)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1378
           1       1.00      0.96      0.98      1008

    accuracy                           0.98      2386
   macro avg       0.99      0.98      0.98      2386
weighted avg       0.98      0.98      0.98      2386



#### Evaluation & Prediction
- Transforming test set for feature extraction
- Submiting to a CSV file


In [19]:
x_test = v.transform(df_test['Text'].values)

In [20]:
y_pred_prob = model.predict_proba(x_test)

In [21]:
d = pd.DataFrame(y_pred_prob.argmax(axis=1), columns=['Class'])
d.to_csv('submit.csv', index=False)