In [1]:
import pandas as pd 
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [2]:
dataset = pd.read_csv("c:/website_classification.csv" , low_memory=False) 
df = pd.DataFrame(dataset)
display(df)
print (df['Category'].value_counts(ascending=True))
print("Dataset size: " ,df.shape)

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel
...,...,...,...,...
1404,1387,http://www.electroshops.com/,electroshops home theater decor interiors seat...,Business/Corporate
1405,1388,http://www.cleanridge.com/,clean ridge soap company clean ridge soap comp...,Business/Corporate
1406,1389,http://www.creativepetgifts.com/,home page pet crafts exquisitely piece handcut...,Business/Corporate
1407,1390,http://www.htmarket.com/,home theater marketplace home theater seating ...,Business/Corporate


Forums                              16
Law and Government                  83
Social Networking and Messaging     83
Food                                91
Photography                         91
News                                92
Computers and Technology            93
Health and Fitness                  96
Games                               98
E-Commerce                         102
Streaming Services                 103
Sports                             103
Travel                             106
Business/Corporate                 109
Education                          110
Name: Category, dtype: int64
Dataset size:  (1409, 4)


In [3]:
#df.drop('Unnamed: 0',axis=1,inplace=True)
display(df)
print (df['Category'].value_counts(ascending=True))

df.replace([np.inf, -np.inf], np.nan, inplace=True) #replace infinity values with NaN
df.dropna(inplace=True) #dropping rows with missing values  
print("Dataset size after removal : " ,df.shape)

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel
...,...,...,...,...
1404,1387,http://www.electroshops.com/,electroshops home theater decor interiors seat...,Business/Corporate
1405,1388,http://www.cleanridge.com/,clean ridge soap company clean ridge soap comp...,Business/Corporate
1406,1389,http://www.creativepetgifts.com/,home page pet crafts exquisitely piece handcut...,Business/Corporate
1407,1390,http://www.htmarket.com/,home theater marketplace home theater seating ...,Business/Corporate


Forums                              16
Law and Government                  83
Social Networking and Messaging     83
Food                                91
Photography                         91
News                                92
Computers and Technology            93
Health and Fitness                  96
Games                               98
E-Commerce                         102
Streaming Services                 103
Sports                             103
Travel                             106
Business/Corporate                 109
Education                          110
Name: Category, dtype: int64
Dataset size after removal :  (1376, 4)


In [4]:
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 

x=df['cleaned_website_text']      
y=df['Category']  

x=CountVectorizer().fit_transform(x.apply(lambda x: np.str_(x)))
   
x=TfidfTransformer().fit_transform(x) 

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [12]:
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from collections import Counter
counter = Counter(y_train.shape) 
print('Before', counter) 
smtom = SMOTE() 
X_train_smtom, y_train_smtom = smtom.fit_resample (X_train, y_train)
counter = Counter(y_train_smtom) 
print('After', counter)
print("Dataset size after balance : " ,y_train_smtom.shape)

Before Counter({963: 1})
After Counter({'Business/Corporate': 80, 'Games': 80, 'Computers and Technology': 80, 'Social Networking and Messaging': 80, 'Photography': 80, 'Law and Government': 80, 'Education': 80, 'News': 80, 'E-Commerce': 80, 'Forums': 80, 'Food': 80, 'Streaming Services': 80, 'Sports': 80, 'Health and Fitness': 80, 'Travel': 80})
Dataset size after balance :  (1200,)


In [15]:
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3 ,n_jobs=-1)
 
# fitting the model for grid search
grid.fit(X_train_smtom,y_train_smtom)


# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  2.8min finished


{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.001)


In [16]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC
svc=SVC(C=1000,gamma=0.001,kernel="rbf")
svc.fit(X_train_smtom,y_train_smtom)
y_pred=svc.predict(X_test)

print(classification_report(y_test, y_pred))


                                 precision    recall  f1-score   support

             Business/Corporate       0.56      0.97      0.71        35
       Computers and Technology       0.82      0.77      0.79        30
                     E-Commerce       1.00      0.90      0.95        31
                      Education       0.82      0.93      0.87        30
                           Food       1.00      0.91      0.95        33
                         Forums       1.00      0.17      0.29         6
                          Games       0.94      0.94      0.94        31
             Health and Fitness       1.00      0.90      0.95        30
             Law and Government       0.93      0.93      0.93        27
                           News       1.00      0.96      0.98        27
                    Photography       1.00      0.72      0.84        18
Social Networking and Messaging       1.00      0.70      0.82        23
                         Sports       0.97      0.