In [64]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [65]:
# Load the training and testing splits from the competition input directory.
train = pd.read_csv('../input/data1111/training_data.csv')
test = pd.read_csv('../input/data1111/testing_data.csv')

In [66]:
# Preview the first rows of the training frame (title text and its category label).
train.head()

Unnamed: 0,title,category
0,The Three Amigos,
1,Home Essentials Blue Floral Glass Vintage Styl...,Home & Kitchen
2,Cooper Wiring Quiet Toggle Switch Single Pole ...,Tools & Home Improvement
3,Baseboarders&reg; Wall Brackets,Tools & Home Improvement
4,The Great Wave Off Kanagawa Custom Rectangle M...,Office Products


In [86]:
# Preview the first rows of the test frame.
# NOTE(review): this cell's execution count (86) is later than the test
# preprocessing cell (76), and the titles shown appear already lower-cased and
# stemmed -- the preview reflects `test` after it was mutated in place, not the
# raw CSV. Re-run from a fresh kernel to see the raw titles here.
test.head()

Unnamed: 0,title,category
0,pom pom hair band rabbit light grey decor pom ...,
1,mariposa golf ball napkin weight,
2,mediterranean snack food roast garlic hummuz c...,
3,john deer gx genuin origin equip manufactur oe...,
4,protech wood cleanr gl perform coat inc fteccga,


In [67]:
# Summarise the training frame: row count, column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20188 entries, 0 to 20187
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     20188 non-null  object
 1   category  20188 non-null  object
dtypes: object(2)
memory usage: 315.6+ KB


In [68]:
# Count missing values per column in the training data.
train.isna().sum()

title       0
category    0
dtype: int64

In [69]:
# text preprocessing
def update_df_for_col_to_bow_array(df,col):
    """Clean a text column in place so it is ready for a bag-of-words vectoriser.

    Each value of ``df[col]`` is processed as follows:
      1. every character that is not an ASCII letter is replaced by a space,
      2. the text is lower-cased and split on whitespace,
      3. English stopwords are dropped ('not' is deliberately kept as a token),
      4. the remaining tokens are Porter-stemmed and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    col : str
        Name of the column to clean; its values must be strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[col]`` replaced by the cleaned strings.
    """
    import re
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

    # Build the loop-invariant helpers once instead of once per row / per word.
    non_letters = re.compile('[^a-zA-Z]')
    ps = PorterStemmer()
    # Keep 'not' as a token; the set difference also tolerates a stopword list
    # that does not contain it (list.remove would raise ValueError).
    all_stopwords = set(stopwords.words('english')) - {'not'}

    corpus = []
    # Walk the values in row order. Looking rows up by label (df[col][i]) fails
    # or picks the wrong row whenever the index is not exactly 0..n-1.
    for text in df[col].tolist():
        words = non_letters.sub(' ', text).lower().split()
        corpus.append(' '.join(ps.stem(word) for word in words if word not in all_stopwords))
    # Assigning a list writes the cleaned strings back positionally.
    df[col] = corpus
    return df

In [70]:
# Clean the training titles (strip non-letters, lower-case, drop stopwords, stem).
# The helper overwrites train['title'] in place and returns the same frame, so
# re-running this cell feeds already-cleaned titles through the cleaner again.
train=update_df_for_col_to_bow_array(train,'title')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
# Bag-of-words model: learn the vocabulary from the cleaned training titles.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Dense document-term count matrix for the training set.
X_train = cv.fit_transform(train['title']).toarray()

In [72]:
# Encode the category strings as integer class ids.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(train['category'])

In [73]:
# Fit a Gaussian naive Bayes classifier on the dense bag-of-words counts.
# NOTE(review): GaussianNB models every feature as a continuous Gaussian; for
# word-count features MultinomialNB is the usual choice and also accepts sparse
# input -- worth evaluating, though it would change the predictions below.
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
clf.fit(X_train,y_train)

GaussianNB()

In [74]:
# Summarise the test frame: row count, column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10094 entries, 0 to 10093
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   title     10094 non-null  object 
 1   category  0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 157.8+ KB


In [75]:
# Count missing values per column in the test data (category is entirely null here).
test.isna().sum()

title           0
category    10094
dtype: int64

In [76]:
# Apply the same text cleaning to the test titles.
# The helper mutates its argument and returns it, so test_a and test refer to
# the same (now cleaned) frame.
test_a=update_df_for_col_to_bow_array(test,'title')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
# Display the cleaned test frame (test_a is the same object as test).
test_a

Unnamed: 0,title,category
0,pom pom hair band rabbit light grey decor pom ...,
1,mariposa golf ball napkin weight,
2,mediterranean snack food roast garlic hummuz c...,
3,john deer gx genuin origin equip manufactur oe...,
4,protech wood cleanr gl perform coat inc fteccga,
...,...,...
10089,flexibl coupl quot x quot white,
10090,mr christma inch mini porcelain music box meta...,
10091,jewish new year shanah tovah rosh hashana card,
10092,hotel collect gridwork queen bedskirt graphit,


In [78]:
# Vectorise the cleaned test titles with the vocabulary already fitted on the
# training titles (transform only, no re-fit), then densify for the classifier.
X_test=cv.transform(test_a['title']).toarray()

In [79]:
# Shape of the test matrix: (number of test titles, vocabulary size learned from train).
X_test.shape

(10094, 20142)

In [80]:
# Predict an encoded class id for every test row; the ids are decoded back to
# category names in the next cell.
y_pred=clf.predict(X_test)

In [81]:
# Map the encoded integer predictions back to the original category names.
# NOTE(review): y_pred is rebound from integer codes to label strings here, so
# re-running only this cell would pass strings to inverse_transform; re-run the
# prediction cell first.
y_pred=le.inverse_transform(y_pred)

In [82]:
# creating test copy to write as csv
# NOTE(review): this copy is never used -- `pred` is reassigned in the next cell.
pred=test.copy()

In [83]:
# Build the prediction frame: a single column (labelled 0) of predicted
# categories that shares the test frame's index, so row i of the output belongs
# to row i of the test data.
# The frame is built directly from y_pred; concatenating onto `test` and then
# discarding its columns aligned rows by index label rather than by position.
pred = pd.DataFrame(y_pred, index=test.index)

In [84]:
# Save the predictions: the row index is written as the 'ID' column and the
# single prediction column under the header 'Category'.
pred.to_csv(
    'predictions.csv',
    index_label='ID',
    header=['Category'],
)

In [85]:
# Display the final prediction frame (single column labelled 0).
pred

Unnamed: 0,0
0,Home & Kitchen
1,
2,
3,Industrial & Scientific
4,
...,...
10089,Industrial & Scientific
10090,Home & Kitchen
10091,Home & Kitchen
10092,Home & Kitchen
