# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [31]:
# Load the tab-separated answers file into a DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters inside the text are kept as
# ordinary characters instead of being interpreted as field delimiters.
dataset = pd.read_csv(
    'gradez_dataset.tsv',
    sep='\t',
    quoting=3,
    encoding='utf-8',
)

In [30]:
# Show the 'Marks' column; pandas abbreviates a long Series when printing it.
print(dataset.loc[:, 'Marks'])

0       1
1       1
2       1
3       1
4       1
       ..
191    10
192    10
193    10
194    10
195    10
Name: Marks, Length: 196, dtype: int64


## Cleaning the texts

In [20]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word lookup once, outside the loop: they do
# not depend on the answer being processed.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stop-word list so it survives the filtering below
# (presumably because negation changes the meaning of an answer).
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # set gives constant-time membership tests

# corpus[k] is the cleaned text of the k-th row of dataset['Answer']:
# letters only, lower-cased, stop words removed, remaining words stemmed and
# re-joined with single spaces.
corpus = []
# Iterate over the column itself rather than a hard-coded row count, so the
# cell keeps working when the dataset grows or shrinks.
for raw_answer in dataset['Answer']:
  # Replace every non-letter character with a space, then tokenize on whitespace.
  answer = re.sub('[^a-zA-Z]', ' ', raw_answer).lower().split()
  answer = ' '.join(ps.stem(word) for word in answer if word not in stopword_set)
  corpus.append(answer)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shriraj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# Preview only the first few cleaned answers; printing the whole corpus floods
# the cell output and bloats the saved notebook.
print(corpus[:5])

l focu becom reduc line code reduc number bug report increas number softwar iter speed complet task focus metric target help softwar develop reach import goal improv softwar use user experi standard definit softwar metric valu softwar develop team softwar metric differ valu differ team depend goal softwar develop team', 'softwar metric standard measur contain mani activ involv degre measur classifi three categori product metric process metric project metric product metric describ characterist product size complex design featur perform qualiti level process metric use improv softwar develop mainten exampl includ effect defect remov develop pattern test defect arriv respons time fix process project metric describ project characterist execut exampl includ number softwar develop staf pattern life cycl softwar cost schedul product metric belong multipl categori exampl process qualiti metric project process metric project metric cost effort estim effort express function one variabl size prog

## Creating the Bag of Words model

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: one row per answer, one column per vocabulary term,
# with the vocabulary capped at 1500 terms via max_features.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()  # dense array, as required by the classifier used later

# The label vector is taken from the last column of the dataset.
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training the Naive Bayes model on the Training set

In [24]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained.
classifier = GaussianNB().fit(X_train, y_train)
# Leave the fitted estimator as the last expression so the cell displays its repr.
classifier

GaussianNB()

## Predicting the Test set results

In [25]:
# Predict a label for every held-out answer.
y_pred = classifier.predict(X_test)

# Print predictions next to the true labels:
# column 0 = predicted, column 1 = actual.
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)

[[ 8  5]
 [ 2  1]
 [ 3  2]
 [10  6]
 [ 9  8]
 [ 5  7]
 [10  8]
 [10  7]
 [ 3  4]
 [ 2  1]
 [ 2  1]
 [ 2  1]
 [10  7]
 [ 7  8]
 [ 9  8]
 [ 9  9]
 [ 6  6]
 [ 8  4]
 [ 4  3]
 [ 9  7]
 [10  3]
 [ 9  9]
 [10  7]
 [ 8  5]
 [ 8  4]
 [ 8  3]
 [ 7  6]
 [10  8]
 [10  9]
 [ 5  3]
 [ 3  2]
 [ 6  8]
 [ 3  4]
 [ 8  2]
 [ 4  1]
 [ 8  5]
 [ 8  9]
 [ 5  6]
 [ 4  8]
 [ 5  2]]


## Making the Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows of the matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test answers whose predicted label exactly equals the true one;
# left as the last expression so the notebook displays it.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[0 4 0 1 0 0 0 0 0 0]
 [0 0 2 0 1 0 0 1 0 0]
 [0 0 0 1 1 0 0 1 0 1]
 [0 0 2 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 3 0 0]
 [0 0 0 0 1 1 1 0 0 1]
 [0 0 0 0 1 0 0 0 1 3]
 [0 0 0 1 0 1 1 0 2 2]
 [0 0 0 0 0 0 0 1 2 1]
 [0 0 0 0 0 0 0 0 0 0]]


0.075