# Natural Language Processing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.ensemble import VotingClassifier

## Importing the dataset

In [None]:
dataset = pd.read_csv('/content/Train - Email Classification.csv')

## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
X = []
y = []

for i in range(0, 1971):
    combined_text = dataset['pre_text'][i] + " " + dataset['post_text'][i]
    review = re.sub('[^a-zA-Z]', ' ', combined_text)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review) # replace your_label_value_here with the actual label for this data point

# Now X contains the processed combined texts, and y contains the corresponding labels


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print(corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y = le.fit_transform(y)

In [None]:
le.classes_

array(['Energy', 'Finance', 'Pharmaceutical', 'Technology', 'Travel'],
      dtype=object)

In [None]:
encoded_values = le.transform(le.classes_)

# Print the names and corresponding values
for name, value in zip(le.classes_, encoded_values):
    print(f"{name}: {value}")

Energy: 0
Finance: 1
Pharmaceutical: 2
Technology: 3
Travel: 4


In [None]:
print(y)

[2 0 1 ... 0 0 1]


In [None]:
y[:5]

array([2, 0, 1, 3, 1])

In [None]:
X.shape

(1971, 1500)

In [None]:
y = y[:1971]
y.shape

(1971,)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [None]:
from xgboost import XGBClassifier
classifi = XGBClassifier()
classifi.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier1 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier1.fit(X_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
y_pred = classifi.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [4 4]
 [0 0]
 [1 1]
 [4 4]
 [1 1]
 [3 3]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [3 3]
 [1 1]
 [4 4]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [3 3]
 [3 3]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [3 3]
 [1 1]
 [2 2]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [3 3]
 [2 2]
 [1 1]
 [3 3]
 [3 3]
 [1 1]
 [0 2]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [4 4]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [3 3]
 [0 0]
 [1 1]
 [3 3]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [2 2]
 [4 4]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [0 0]
 [1 1]
 [3 3]
 [1 1]
 [0 0]
 [4 4]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [2 2]
 [2 2]
 [1 1]
 [3 3]
 [1 1]
 [3 3]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [0 0]
 [2 2]
 [1 1]
 [0 0]
 [0 0]
 [2 2]
 [2 2]
 [2 2]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [4 4]
 [3 3]
 [3 3]
 [0 0]
 [1 1]

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[108   1   0   0   0]
 [  0 185   0   0   0]
 [  3   0  36   0   0]
 [  0   5   0  42   0]
 [  0   0   0   0  15]]


0.9772151898734177

In [None]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifi, X = X_test, y = y_test, cv = 20)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))



Accuracy: 94.46 %
Standard Deviation: 4.15 %


In [None]:
!pip install joblib



In [None]:
import joblib
# Save the trained model to a file

joblib.dump(classifi, 'xgb_model.pkl')

['xgb_model.pkl']

In [None]:
df = pd.read_csv('/content/Test - Email Classification.csv')
X1 = df[['post_text','pre_text']]
print(X1)

                                             post_text  \
0    ['credit lines devon has a $ 3.0 billion senio...   
1    ['asset retirement obligations as of december ...   
2                                                ['.']   
3    ['leverage ( 1 ) 6.08 4.03 ( 1 ) tier 1 capita...   
4    ['the majority of unused commitments are conti...   
..                                                 ...   
536  ['( 1 ) includes $ 164 million in 2016 related...   
537  ['the total fair value of restricted stock tha...   
538  ['long-term debt at december 31 , 2008 and dec...   
539  ['the vesting date fair value of restricted st...   
540  ['class e preferred stock 8 7/8% ( 7/8 % ) 2.2...   

                                              pre_text  
0    ['devon energy corporation and subsidiaries no...  
1    ['marathon oil corporation notes to consolidat...  
2    ['dividends for a summary of the cash dividend...  
3    ['capital resources and liquidity capital reso...  
4    ['credit comm

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []

for i in range(0, 541):
    combined_text = df['pre_text'][i] + " " + df['post_text'][i]
    review = re.sub('[^a-zA-Z]', ' ', combined_text)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review) # replace your_label_value_here with the actual label for this data point

# Now X contains the processed combined texts, and y contains the corresponding labels



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X1 = cv.fit_transform(corpus).toarray()

In [None]:
print(X1)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
predicted_values = classifier.predict(X1)

In [None]:
df['Predicted'] = predicted_values

In [None]:
df

Unnamed: 0,id,post_text,pre_text,table,Predicted
0,DVN/2015/page_92.pdf-4,['credit lines devon has a $ 3.0 billion senio...,['devon energy corporation and subsidiaries no...,"[array(['2016', '$ 976'], dtype=object)\n arra...",1
1,MRO/2009/page_127.pdf-3,['asset retirement obligations as of december ...,['marathon oil corporation notes to consolidat...,"[array(['( in millions )', '2009', '2008'], dt...",1
2,C/2010/page_306.pdf-1,['.'],['dividends for a summary of the cash dividend...,"[array(['december 31,', 'citigroup', 's&p 500 ...",1
3,C/2008/page_100.pdf-1,['leverage ( 1 ) 6.08 4.03 ( 1 ) tier 1 capita...,['capital resources and liquidity capital reso...,"[array(['at year end', '2008', '2007'], dtype=...",2
4,C/2018/page_296.pdf-4,['the majority of unused commitments are conti...,['credit commitments and lines of credit the t...,"[array(['in millions of dollars', 'u.s .', 'ou...",3
...,...,...,...,...,...
536,EOG/2017/page_93.pdf-2,['( 1 ) includes $ 164 million in 2016 related...,['14 .'\n 'accounting for certain long-lived a...,"[array(['', '2017', '2016'], dtype=object)\n a...",1
537,CME/2010/page_113.pdf-5,['the total fair value of restricted stock tha...,['the company granted 1020 performance shares ...,"[array(['', 'number of shares', 'weighted aver...",1
538,C/2008/page_176.pdf-1,"['long-term debt at december 31 , 2008 and dec...",['cgmhi also has substantial borrowing arrange...,"[array(['in millions of dollars', '2009', '201...",3
539,MRO/2009/page_137.pdf-2,['the vesting date fair value of restricted st...,['marathon oil corporation notes to consolidat...,"[array(['', 'awards', 'weighted-averagegrant d...",1


In [None]:
df = df.drop('pre_text',	axis =1)
df

Unnamed: 0,id,Predicted
0,DVN/2015/page_92.pdf-4,1
1,MRO/2009/page_127.pdf-3,1
2,C/2010/page_306.pdf-1,1
3,C/2008/page_100.pdf-1,2
4,C/2018/page_296.pdf-4,3
...,...,...
536,EOG/2017/page_93.pdf-2,1
537,CME/2010/page_113.pdf-5,1
538,C/2008/page_176.pdf-1,3
539,MRO/2009/page_137.pdf-2,1


In [None]:
# Your dictionary
mapping = {0: 'Energy', 1: 'Finance', 2: 'Pharma', 3: 'Technology', 4: 'Travel'}
df['Predicted'] = df['Predicted'].replace(mapping)

In [None]:
df

Unnamed: 0,id,Predicted
0,DVN/2015/page_92.pdf-4,Finance
1,MRO/2009/page_127.pdf-3,Finance
2,C/2010/page_306.pdf-1,Finance
3,C/2008/page_100.pdf-1,Pharma
4,C/2018/page_296.pdf-4,Technology
...,...,...
536,EOG/2017/page_93.pdf-2,Finance
537,CME/2010/page_113.pdf-5,Finance
538,C/2008/page_176.pdf-1,Technology
539,MRO/2009/page_137.pdf-2,Finance


In [None]:
df.to_csv('test_data1.csv',index = False)

In [None]:
da = df['Predicted'].value_counts()
da

Predicted
Finance       258
Technology    140
Pharma         67
Energy         48
Travel         28
Name: count, dtype: int64