In [1]:
# Mount Google Drive in the Colab runtime; the dataset CSVs are read from /content/drive below.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load the test split and preview its first rows.
# QUOTE_NONE disables quote handling, so double quotes are read as ordinary
# characters; the backslash is the escape character.
path = "/content/drive/MyDrive/Datasets/test.csv"
df_test = pd.read_csv(
    path,
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)
df_test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [3]:
# Load the labelled training split, keep only fully populated rows, and preview it.
# NOTE(review): dropna() without `subset` discards a row when ANY column is missing,
# although only TITLE and BROWSE_NODE_ID are used further down - confirm this is intended.
path = "/content/drive/MyDrive/Datasets/train.csv"
df_train = pd.read_csv(
    path,
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
).dropna()
df_train.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
5,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,"[Color: Blue,Sleeve: Full Sleeve,Material: Cot...",Bhavya Enterprise,5
6,Glance Women's Wallet (Black) (LW-21),This Black wallet by Glance will be a treasure...,[The Most Comfortable Women's Wallet That You ...,Glance,6
7,Wild Animals Hungry Brain Educational Flash Ca...,Wild Animals are the animals that mostly stays...,[Playful learning: Flash cards develops the lo...,hungry brain,7


In [4]:
# Punctuation characters stripped from titles during cleaning.
punctuation_signs = ["?", ":", "!", ".", ",", ";"]

# Download the NLTK packages this notebook refers to.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Lemmatizer instance and a list copy of the English stop words.
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = list(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Build a normalised 'Title' column from the raw TITLE text: turn line breaks and
# 4-space runs into a single space, drop double quotes, lower-case, then remove
# the characters in `punctuation_signs` and every "'s".
# regex=False is passed explicitly so that characters such as "?" and "." are
# always matched literally, independent of the installed pandas version's default.
df_train['Title'] = (
    df_train['TITLE']
    .str.replace("\r", " ", regex=False)
    .str.replace("\n", " ", regex=False)
    .str.replace("    ", " ", regex=False)
    .str.replace('"', '', regex=False)
    .str.lower()
)
for punct_sign in punctuation_signs:
    df_train['Title'] = df_train['Title'].str.replace(punct_sign, '', regex=False)
df_train['Title'] = df_train['Title'].str.replace("'s", "", regex=False)

In [6]:
# Keep only the model input and the label, then cap the training set at the
# first 35,000 rows (positional slice).
final_cols = ["Title", "BROWSE_NODE_ID"]
df_train = df_train.loc[:, final_cols].iloc[:35000]

In [7]:
# Apply the same title normalisation to the test split: turn line breaks and
# 4-space runs into a single space, drop double quotes, lower-case, then remove
# the characters in `punctuation_signs` and every "'s".
# regex=False is passed explicitly so that characters such as "?" and "." are
# always matched literally, independent of the installed pandas version's default.
df_test['Title'] = (
    df_test['TITLE']
    .str.replace("\r", " ", regex=False)
    .str.replace("\n", " ", regex=False)
    .str.replace("    ", " ", regex=False)
    .str.replace('"', '', regex=False)
    .str.lower()
)
for punct_sign in punctuation_signs:
    df_test['Title'] = df_test['Title'].str.replace(punct_sign, '', regex=False)
df_test['Title'] = df_test['Title'].str.replace("'s", "", regex=False)

In [8]:
# Reduce the test frame to the cleaned title and the product identifier.
final_cols = ["Title", "PRODUCT_ID"]
df_test = df_test.loc[:, final_cols]

In [9]:
# Preview the cleaned training frame (first five rows).
df_train.head(n=5)

Unnamed: 0,Title,BROWSE_NODE_ID
0,pete the cat bedtime blues doll 145 inch,0
1,the new yorker nyhm014 refrigerator magnet 2 x 35,1
5,men full sleeve raglan t-shirts denim t-shirt ...,5
6,glance women wallet (black) (lw-21),6
7,wild animals hungry brain educational flash ca...,7


In [10]:
# Preview the cleaned test frame (first five rows).
df_test.head(n=5)

Unnamed: 0,Title,PRODUCT_ID
0,command 3m small kitchen hooks white decorate ...,1
1,o'neal jump hardware jag unisex-adult glove (b...,2
2,nfl detroit lions portable party fridge 158 quart,3
3,panasonic single line kx-ts880mx corded phone ...,4
4,zero baby girl 100% cotton innerwear bloomer d...,5


In [11]:
# Count missing values per column in the training frame.
df_train.isna().sum(axis=0)

Title             0
BROWSE_NODE_ID    0
dtype: int64

In [12]:
# Replace missing test titles with the placeholder "No Data".
# The result is assigned back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form operates on a temporary Series under
# pandas Copy-on-Write and would leave df_test unchanged.
df_test = df_test.assign(Title=df_test["Title"].fillna("No Data"))

In [13]:
# Count missing values per column in the test frame.
df_test.isna().sum(axis=0)

Title         0
PRODUCT_ID    0
dtype: int64

In [14]:
# Model inputs: cleaned titles as features, BROWSE_NODE_ID as the training label.
X_train = df_train["Title"]
X_test = df_test["Title"]
y_train = df_train["BROWSE_NODE_ID"]

In [15]:
# Fit a uni-/bi-gram TF-IDF vocabulary on the training titles only, then project
# the test titles onto that same vocabulary. Each shape is printed after its step.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    stop_words='english',
)
print('1')
X_train_vectors_tfidf = tfidf.fit_transform(X_train)
print(X_train_vectors_tfidf.shape)
print('1')
X_test_vectors_tfidf = tfidf.transform(X_test)
print(X_test_vectors_tfidf.shape)

1
(35000, 16942)
1
(110775, 16942)


In [16]:
# Train an L2-penalised logistic regression (liblinear solver, C=10) on the
# TF-IDF features; the fitted estimator is the cell's displayed value.
lr_tfidf = LogisticRegression(
    penalty='l2',
    C=10,
    solver='liblinear',
)
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Predict a BROWSE_NODE_ID for every vectorised test title.
y_predict = lr_tfidf.predict(X=X_test_vectors_tfidf)

In [18]:
# Display the predicted labels for the test titles.
y_predict

array([1140, 1045, 8269, ...,  800,  800,  800])

In [19]:
# Number of predictions produced (one per test row is expected).
y_predict.shape[0]

110775

In [20]:
# Column-name -> values mapping used to build the predictions frame.
d = dict(BROWSE_NODE_ID=y_predict)

In [21]:
# Wrap the predictions in a single-column DataFrame and display it.
df1 = pd.DataFrame(d)
df1

Unnamed: 0,BROWSE_NODE_ID
0,1140
1,1045
2,8269
3,125
4,1922
...,...
110770,4368
110771,1551
110772,800
110773,800


In [22]:
# Attach the predicted BROWSE_NODE_ID to the test rows.
# The values are added positionally (as a NumPy array) rather than via
# pd.concat(axis=1), which aligns on index labels and would silently produce NaN /
# misaligned rows if df_test ever lost its default 0..n-1 index.
assert len(df1) == len(df_test), "prediction count must match the number of test rows"
df_new = df_test.assign(BROWSE_NODE_ID=df1["BROWSE_NODE_ID"].to_numpy())
df_new.head()

Unnamed: 0,Title,PRODUCT_ID,BROWSE_NODE_ID
0,command 3m small kitchen hooks white decorate ...,1,1140
1,o'neal jump hardware jag unisex-adult glove (b...,2,1045
2,nfl detroit lions portable party fridge 158 quart,3,8269
3,panasonic single line kx-ts880mx corded phone ...,4,125
4,zero baby girl 100% cotton innerwear bloomer d...,5,1922


In [23]:
# Keep only the two columns written to the submission file, then display the frame.
l = ["PRODUCT_ID", "BROWSE_NODE_ID"]
df_new = df_new.loc[:, l]
df_new

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,1045
2,3,8269
3,4,125
4,5,1922
...,...,...
110770,110771,4368
110771,110772,1551
110772,110773,800
110773,110774,800


In [24]:
# Write the submission to Drive with a header row and without the index column.
df_new.to_csv(
    "/content/drive/MyDrive/Datasets/submission.csv",
    header=True,
    index=False,
)

In [25]:
# Read the written submission back from Drive and preview it.
path = "/content/drive/MyDrive/Datasets/submission.csv"
df_sub = pd.read_csv(filepath_or_buffer=path)
df_sub.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,1045
2,3,8269
3,4,125
4,5,1922


In [26]:
# Count missing values per column in the reloaded submission.
df_sub.isna().sum(axis=0)

PRODUCT_ID        0
BROWSE_NODE_ID    0
dtype: int64

In [27]:
# Summary statistics of the reloaded submission's numeric columns.
df_sub.describe()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
count,110775.0,110775.0
mean,55388.0,1814.854317
std,31978.132372,2984.057629
min,1.0,0.0
25%,27694.5,507.0
50%,55388.0,1045.0
75%,83081.5,1687.0
max,110775.0,47970.0
