# Importing Libraries

In [1]:
import numpy as np
import math
import seaborn as sns
from sklearn import svm
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import *
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import *

# Loading the dataset 

In [6]:
df = pd.read_csv('df_orignal.csv')

In [7]:
#dropping NaN values
df.dropna(subset = ["title","description"], inplace=True)
df.reset_index(drop=True,inplace=True)

# Getting all the features separately

In [8]:
df_link = pd.DataFrame(columns = ["link"])        
df_title = pd.DataFrame(columns = ["title"])        
df_description = pd.DataFrame(columns = ["description"])        
df_category = pd.DataFrame(columns = ["category"])        
df_link['link'] = df['link'] 
df_title ['title']= df['title'] 
df_description['description'] = df['description'] 
df_category['category'] = df['category']

# Importing libraries for data cleaning 

In [9]:
import re 
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

# Cleaning the data and storing it into a list 

In [10]:
corpus = []        
for i in range(0, df.shape[0]):         
  review = re.sub('[^a-zA-Z]', ' ', df_title['title'][i])            
  review = review.lower()            
  review = review.split()            
  ps = PorterStemmer()            
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]            
  review = ' '.join(review)            
  corpus.append(review)

In [11]:
corpus1 = [] 
for i in range(0, df.shape[0]):            
  review = re.sub('[^a-zA-Z]', ' ', df_description['description'][i])            
  review = review.lower()            
  review = review.split()            
  ps = PorterStemmer()            
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]            
  review = ' '.join(review)            
  corpus1.append(review)

# Performing label encoding on the category feature

In [12]:
from sklearn.preprocessing import LabelEncoder
dfcategory = df_category.apply(LabelEncoder().fit_transform)

In [13]:
# Creating a new dataset after cleaning the data and label encoding the categories
df_new = pd.concat([df_link, df_title, df_description, df_category], axis=1)##, join_axes = [df_link.index])

# Creating the bag of words model using countVectorizer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer   
cv = CountVectorizer(max_features = 1500,ngram_range=(1,2)) 
X = cv.fit_transform(corpus, corpus1).toarray() 
y = df_new.iloc[:, 3].values

# Splitting the dataset into the Training set and Test set 

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Random Forest 

In [45]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy')
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=1000)

In [46]:
y_pred = classifier.predict(X_test)

In [47]:
classifier.score(X_test, y_test)

0.9900990099009901

In [48]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

        Dance       1.00      1.00      1.00        21
         Food       1.00      1.00      1.00        15
      History       0.94      1.00      0.97        15
Manufacturing       1.00      1.00      1.00        18
      Science       1.00      0.92      0.96        13
       Travel       1.00      1.00      1.00        19

     accuracy                           0.99       101
    macro avg       0.99      0.99      0.99       101
 weighted avg       0.99      0.99      0.99       101



In [49]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[21,  0,  0,  0,  0,  0],
       [ 0, 15,  0,  0,  0,  0],
       [ 0,  0, 15,  0,  0,  0],
       [ 0,  0,  0, 18,  0,  0],
       [ 0,  0,  1,  0, 12,  0],
       [ 0,  0,  0,  0,  0, 19]], dtype=int64)

# SVM 

In [50]:
from sklearn.svm import SVC
classifier1 = SVC(kernel = 'linear', random_state = 0)
classifier1.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

In [51]:
y_pred1 = classifier1.predict(X_test)

In [52]:
classifier1.score(X_test, y_test)

0.9702970297029703

In [53]:
# Making the Confusion Matrix
cm1 = confusion_matrix(y_test, y_pred1)
cm1

array([[21,  0,  0,  0,  0,  0],
       [ 0, 15,  0,  0,  0,  0],
       [ 1,  0, 14,  0,  0,  0],
       [ 0,  1,  0, 17,  0,  0],
       [ 1,  0,  0,  0, 12,  0],
       [ 0,  0,  0,  0,  0, 19]], dtype=int64)

# Naive Bayes 

In [54]:
from sklearn.naive_bayes import GaussianNB
classifier2 = GaussianNB()
classifier2.fit(X_train, y_train)

GaussianNB()

In [55]:
y_pred2 = classifier2.predict(X_test)

In [56]:
classifier2.score(X_test, y_test)

0.9306930693069307

In [57]:
# Making the Confusion Matrix
cm2 = confusion_matrix(y_test, y_pred2)
cm2

array([[20,  0,  1,  0,  0,  0],
       [ 0, 13,  0,  2,  0,  0],
       [ 0,  0, 14,  1,  0,  0],
       [ 0,  0,  1, 17,  0,  0],
       [ 0,  0,  1,  0, 12,  0],
       [ 0,  0,  0,  0,  1, 18]], dtype=int64)