# Importing libraries
--------------

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Reading Data
------------------

In [2]:
# Load the product catalogue (category + description columns) from the local CSV file.
df = pd.read_csv(filepath_or_buffer="set_2.csv")

In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,category,description
0,Clothing,key features of alisha solid women cycling sho...
1,Furniture,fabhomedecor fabric double sofa bed finish col...
2,Footwear,key features of aw bellies sandals wedges heel...
3,Clothing,key features of alisha solid women cycling sho...
4,Pet Supplies,specifications of sicons all purpose arnica do...


# Data Preparation
----------

#### Replacing NaN with empty strings, as NaN is treated as a float in numpy, which would hinder our further processing

In [4]:
# Missing descriptions become empty strings and every entry is forced to `str`,
# so the text vectoriser never receives a float NaN.
df["description"] = df["description"].fillna(value="").astype(str)

#### Making the train/test split at a 75:25 ratio

In [5]:
# 75:25 split; test_size=0.25 is scikit-learn's default, spelled out here so the
# ratio is visible. The fixed random_state keeps the split reproducible.
training_data, testing_data = train_test_split(
    df,
    test_size=0.25,
    random_state=2000,
)

In [6]:
# Inspect the description column of the training split.
training_data.loc[:, "description"]

6608     flipkart com buy ski fancy school art plastic ...
8567     timewel analog watch for men buy timewel analo...
18395    specifications of ucb baby boy shirt general d...
14820    nutcase sticker wrap design football love ml b...
15820    get glamr designer brogue boots buy get glamr ...
                               ...                        
4380     wow crystal necklace buy wow crystal necklace ...
13947    nimya solid men polo neck shirt buy maroon bla...
1590     adhira alloy jewel set buy adhira alloy jewel ...
4045     buy allure auto cm car mat tata nano for rs on...
4936     buy pratami silk solid blouse material for rs ...
Name: description, Length: 15000, dtype: object

In [7]:
# Pull the target labels of each split out as plain NumPy arrays.
Y_train, Y_test = (
    split["category"].values for split in (training_data, testing_data)
)

### Building a vocabulary from our description column, using the TF-IDF vectorizer as our method of feature weighting

In [17]:
def tdif_ext(training_data, testing_data):
    """Turn the ``description`` column of both splits into TF-IDF features.

    The vectoriser is fitted on the training descriptions only, so the
    vocabulary and IDF weights never see the test split.

    Parameters
    ----------
    training_data : DataFrame-like with a ``"description"`` column of text.
    testing_data : DataFrame-like with a ``"description"`` column of text.

    Returns
    -------
    tuple
        ``(train_set, test_set, vect)`` - the TF-IDF matrix of the training
        descriptions, the TF-IDF matrix of the test descriptions, and the
        fitted ``TfidfVectorizer`` (reusable to transform new text).
    """
    # max_df=0.95 is passed through unchanged from the original configuration.
    vect = TfidfVectorizer(use_idf=True, max_df=0.95)

    # Fit and transform the training corpus in a single pass; the previous code
    # discarded the fit_transform result and vectorised the same corpus again.
    train_set = vect.fit_transform(training_data["description"].values)
    test_set = vect.transform(testing_data["description"].values)

    return train_set, test_set, vect

In [31]:
# Build the TF-IDF feature matrices for both splits and keep the fitted
# vectoriser so new descriptions can be transformed later on.
X_train, X_test, transformer = tdif_ext(
    training_data=training_data,
    testing_data=testing_data,
)

# Modelling and Evaluation
-----------------

In [28]:
# L2-regularised logistic regression (liblinear solver, C=5) with a fixed seed.
model = LogisticRegression(
    penalty="l2",
    C=5,
    solver="liblinear",
    random_state=0,
)
# fit() returns the estimator itself, so `model` ends up as the trained classifier.
model = model.fit(X_train, Y_train)

In [29]:
# Score the trained classifier on the held-out test split.
model.score(X=X_test, y=Y_test)

0.9614

### The model reaches an accuracy of 96.14 percent on the held-out test split.

### Testing it across a few entries to see the actual vs. predicted labels

In [21]:
# Spot-check rows 16999-17059 of the full frame: print each actual label
# followed by the label the model predicts for that row's description.
# NOTE(review): these rows come from `df`, not `testing_data`, so some of them
# were most likely part of the training split - treat this as a sanity check only.
sample_rows = df.loc[16999:17059]

# Vectorise and predict the whole slice in one call instead of once per row.
sample_predictions = model.predict(
    transformer.transform(sample_rows["description"].values)
)

for position, actual_label in enumerate(sample_rows["category"]):
    print(actual_label, end=" ")
    # Slice (rather than index) so each prediction still prints as a
    # one-element array, e.g. ['Clothing '].
    print(sample_predictions[position:position + 1])

Home Furnishing  ['Home Furnishing ']
Home Furnishing  ['Home Furnishing ']
Home Furnishing  ['Home Furnishing ']
Home Furnishing  ['Home Furnishing ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Jewellery  ['Jewellery ']
Jewellery  ['Jewellery ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Jewellery  ['Jewellery ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessories  ['Mobiles & Accessories ']
Mobiles & Accessorie

### Testing it with a new, unseen entry: a T-shirt description copied from flipkart.com

In [32]:
# A product description that is not part of the dataset, used as an ad-hoc smoke test.
new_description = "High quality Printed Hoodie Full sleeves Tshirt direct from the manufacturers. 100% Pure combed 155 - 160 GSM Cotton used. Gives you perfect fit, comfort feel and handsome look. Trusted brand online and no compromise on quality"
# The vectoriser expects an iterable of documents, hence the one-element list.
new_features = transformer.transform([new_description])
print(model.predict(new_features))

['Clothing ']
