# Natural Language Processing

## Importing the libraries

In [51]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [52]:
# Load the training split; quoting=0 with quotechar='"' spells out the
# parser's standard double-quote handling explicitly.
dataset = pd.read_csv(
    'train.csv',
    quotechar='"',
    quoting=0,
)

In [53]:
# Load the test split with the same quoting options as the training split.
dataset_test = pd.read_csv(
    'test.csv',
    quotechar='"',
    quoting=0,
)

In [54]:
# Replace missing values in the two metadata columns with the placeholder
# string 'unknown'.
for column_name in ('keyword', 'location'):
  dataset[column_name] = dataset[column_name].fillna('unknown')

In [55]:
# Apply the same 'unknown' placeholder to missing keyword/location values
# in the test frame.
for column_name in ('keyword', 'location'):
  dataset_test[column_name] = dataset_test[column_name].fillna('unknown')

## Cleaning the texts

In [56]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word lookup once, outside the loop: they do
# not depend on the row being processed, and rebuilding the set for every
# word made the filter needlessly slow.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stop-word list so it survives the filter below
# (presumably because negation matters for the classifier).
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# corpus[k] is the cleaned version of the k-th training text: non-letters
# replaced by spaces, lower-cased, stop words dropped, remaining words stemmed.
corpus = []
# Iterate over the column values directly instead of dataset['text'][i], so
# the loop does not rely on the frame having a default 0..n-1 index.
for raw_text in dataset['text']:
  text = re.sub('[^a-zA-Z]', ' ', raw_text)
  text = text.lower()
  text = text.split()
  text = [ps.stem(word) for word in text if word not in stopword_set]
  text = ' '.join(text)
  corpus.append(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Build the stemmer and the stop-word lookup once, outside the loop: they do
# not depend on the row being processed, and rebuilding the set for every
# word made the filter needlessly slow.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stop-word list so it survives the filter below.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# corpus_test[k] is the cleaned version of the k-th test text, produced with
# the same steps as the training corpus: non-letters replaced by spaces,
# lower-cased, stop words dropped, remaining words stemmed.
corpus_test = []
# Iterate over the column values directly instead of dataset_test['text'][i],
# so the loop does not rely on the frame having a default 0..n-1 index.
for raw_text in dataset_test['text']:
  text = re.sub('[^a-zA-Z]', ' ', raw_text)
  text = text.lower()
  text = text.split()
  text = [ps.stem(word) for word in text if word not in stopword_set]
  text = ' '.join(text)
  corpus_test.append(text)

## Creating the Bag of Words model

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# Labels are taken from the last column of the training frame.
y_train = dataset.iloc[:, -1].values

# Fit the bag-of-words vocabulary (capped at 1500 features) on the cleaned
# training corpus, then densify the resulting count matrix.
cv = CountVectorizer(max_features=1500)
train_counts = cv.fit_transform(corpus)
X_train = train_counts.toarray()

In [59]:
# Encode the cleaned test corpus with the vocabulary already fitted on the
# training corpus (transform only, no refit), then densify.
test_counts = cv.transform(corpus_test)
X_test = test_counts.toarray()

## Training the Naive Bayes model on the Training set

In [60]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the dense training count matrix.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [61]:
# Take the first column (used as the submission "id") from the test frame
# that is already in memory instead of reading test.csv a second time.
# This avoids redundant disk I/O and guarantees the ids come from the same
# rows, in the same order, as the texts that were vectorised into X_test.
dataset_Id = dataset_test.iloc[:, 0].values

In [62]:
# Predict a label for every row of the encoded test matrix.
y_pred = classifier.predict(X=X_test)

In [63]:
# Pair each test id with its predicted label in a two-column table
# ("id" first, then "target").
submission_columns = {"id": dataset_Id, "target": y_pred}
output = pd.DataFrame(submission_columns)

In [64]:
# Show a plain-text preview of the id/target prediction table.
print(output)

         id  target
0         0       0
1         2       0
2         3       0
3         9       1
4        11       1
...     ...     ...
3258  10861       1
3259  10865       0
3260  10868       0
3261  10874       1
3262  10875       1

[3263 rows x 2 columns]


In [65]:
# Wrap the prediction table and write it to output.csv without the row-index
# column, so the file contains only the "id" and "target" columns.
df = pd.DataFrame(data=output)
df.to_csv(path_or_buf="output.csv", index=False)