![image.png](attachment:122986e3-9079-45a6-861e-a7e1504de990.png)

### Importing libraries and getting the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string
from nltk.corpus import stopwords
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Read the website-classification CSV and preview its first rows.
csv_path='../input/website-classification/website_classification.csv'
website_df=pd.read_csv(csv_path)
website_df.head()

In [None]:
# Report the dataframe's dimensions and its column labels.
df_shape=website_df.shape
df_columns=website_df.columns
print('Shape of dataframe:',df_shape)
print('Columns in dataframe:',df_columns)

In [None]:
# Drop the unnecessary 'Unnamed: 0' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError once the column is gone.
website_df=website_df.drop(columns='Unnamed: 0',errors='ignore')
website_df.head()

In [None]:
# Count the missing values in each column.
website_df.isna().sum()

### Exploratory data analysis

#### Categories of websites in the dataset

In [None]:
# Non-null row counts per website category, ordered by category name.
category_counts=website_df.groupby('Category').count()
df_a=category_counts.sort_values(by='Category',ascending=True)
df_a.index

In [None]:
# Show the per-category counts held in df_a.
df_a

In [None]:
# Horizontal count plot of the categories, ordered by frequency (at most 16 bars).
top_categories=website_df['Category'].value_counts().index[:16]
sns.countplot(y=website_df['Category'],order=top_categories)

From the plot it can be seen that the **top 5 website categories are:**
    
 1.Educational

 2.Business/Corporate
    
 3.Travel

 4.Streaming Services
    
 5.Sports

#### Wordcloud

In [None]:
#More frequently occurring words appear larger.


from wordcloud import WordCloud,STOPWORDS
stopwords_df=set(STOPWORDS)

# Join the text of every website in a single pass. Growing the string with `+`
# inside a loop re-copies it on every row, and splitting on ' ' only to join on
# ' ' again left each text unchanged anyway.
content=' '.join(website_df['cleaned_website_text'])+' '

wordcloud_df=WordCloud(width=800,height=600,background_color='white',stopwords=stopwords_df).generate(content)
plt.figure(figsize=(8,8))
plt.axis('off')
plt.imshow(wordcloud_df)

#### Total length of words on all websites (combined) of different categories 

In [None]:
# Total number of characters of cleaned text per website category.
#
# len() of a string counts characters, not words, so every total below is a
# character count. One group-by replaces the sixteen copy-pasted loops and keeps
# each total attached to its own category label: the hand-built list of totals
# and the hand-built list of labels used to be in different orders, which
# mislabelled most rows of x3 (and therefore of the bar chart built from it).
LENGTH_COLUMN='Sum of length of all the words on the site'

text_lengths=website_df['cleaned_website_text'].str.len()          #Characters per website
category_totals=text_lengths.groupby(website_df['Category']).sum() #Summed within each category

for category,total in category_totals.items():
    print('Total number of characters across all the',category,'websites: ',total)

# Same two-column layout as before, so downstream cells that read x3 keep working.
x3=category_totals.rename_axis('Website category').reset_index(name=LENGTH_COLUMN)
x3.sort_values(by=LENGTH_COLUMN,ascending=False)

In [None]:
# Bar chart of the per-category totals stored in x3.
category_col='Website category'
total_col='Sum of length of all the words on the site'
sns.set_style('whitegrid')
plt.figure(figsize=(20,6))
sns.barplot(data=x3,x=category_col,y=total_col)
plt.xticks(rotation=75)
plt.title('Category wise total number of characters on all the websites (combined)',fontweight='bold',fontsize=15)

The dataframe x3 and the plot above indicate the **total number of characters that have been used to make all the websites of a particular category**, or in other words, **the total length of the words that are there in all the websites of a particular category.**


**For example**- All the 107 travel websites have used 1020091 characters in total. 

The **top 5 website categories that have used maximum number of characters** are:

1.Travel

2.Streaming Services

3.Education

4.Social Networking & Messaging

5.Sports


### Applying algorithm (Bag of Words model)

In [None]:
# Features: the cleaned website text. Target: the website category.
X,y=website_df['cleaned_website_text'],website_df['Category']

#### Removing stopwords

In [None]:
def text_process(value):
    """Strip punctuation and English stopwords from a piece of text.

    Parameters
    ----------
    value : str
        Text to clean.

    Returns
    -------
    list of str
        The whitespace-separated words of `value`, in their original order and
        case, with every character in `string.punctuation` removed and every
        word whose lowercase form is an NLTK English stopword dropped.
    """
    # Load the stopword list once per call and keep it as a set: the list used to
    # be re-read and scanned linearly for every single word.
    english_stopwords=set(stopwords.words('english'))
    nopunc=value.translate(str.maketrans('','',string.punctuation))   #Removes punctuation
    return [word for word in nopunc.split() if word.lower() not in english_stopwords]  #Removes stopwords

In [None]:
# Preview the tokenised, stopword-free text for the first few websites.
# The result is only displayed here; X itself is left unchanged.
tokenised_text=X.apply(text_process)
tokenised_text.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
X=CountVectorizer().fit_transform(X)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer    
X=TfidfTransformer().fit_transform(X)               

### Naive Bayes classifier

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes classifier, constructed with no arguments (library defaults).
website_class=MultinomialNB()

In [None]:
# Train the Naive Bayes model on the training split.
website_class.fit(X_train,y_train)

In [None]:
# Predict a category for every sample in the test split.
predictions=website_class.predict(X_test)

In [None]:
# Display the Naive Bayes predictions for the test split.
predictions

In [None]:
# scikit-learn expects (y_true, y_pred); passing them the other way round swaps
# the precision and recall columns and reports support for the predicted labels.
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix for Naive Bayes: rows are the true categories, columns the
# predicted ones (confusion_matrix takes y_true first, then y_pred).
nb_labels=website_class.classes_
plt.figure(figsize=(10,6))
sns.heatmap(confusion_matrix(y_test,predictions,labels=nb_labels),cmap='YlGn_r',annot=True,fmt='d',
            xticklabels=nb_labels,yticklabels=nb_labels)   #fmt='d' prints counts as plain integers
plt.xlabel('Predicted category')
plt.ylabel('True category');

In [None]:
# Fraction of test samples classified correctly by Naive Bayes.
nb_accuracy=accuracy_score(y_test,predictions)
print(nb_accuracy)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed so the forest (and the scores reported below) are reproducible
# across runs; 101 matches the seed used for the train/test split.
rfc=RandomForestClassifier(random_state=101)

In [None]:
# Train the random forest on the training split.
rfc.fit(X_train,y_train)

In [None]:
# Predict a category for every sample in the test split.
predictions_rfc=rfc.predict(X_test)

In [None]:
# Display the random forest predictions for the test split.
predictions_rfc

In [None]:
# scikit-learn expects (y_true, y_pred); passing them the other way round swaps
# the precision and recall columns and reports support for the predicted labels.
print(classification_report(y_test,predictions_rfc))

In [None]:
# Confusion matrix for the random forest: rows are the true categories, columns
# the predicted ones (confusion_matrix takes y_true first, then y_pred).
rfc_labels=rfc.classes_
plt.figure(figsize=(10,6))
sns.heatmap(confusion_matrix(y_test,predictions_rfc,labels=rfc_labels),cmap='PiYG',annot=True,fmt='d',
            xticklabels=rfc_labels,yticklabels=rfc_labels)   #fmt='d' prints counts as plain integers
plt.xlabel('Predicted category')
plt.ylabel('True category');

In [None]:
# Fraction of test samples classified correctly by the random forest.
rfc_accuracy=accuracy_score(y_test,predictions_rfc)
print(rfc_accuracy)

### Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier, constructed with no arguments (library defaults).
svc=SVC()

In [None]:
# Train the support vector classifier on the training split.
svc.fit(X_train,y_train)

In [None]:
# Predict a category for every sample in the test split.
predictions_svc=svc.predict(X_test)

In [None]:
# Display the SVM predictions (this cell used to show the Naive Bayes
# `predictions` array by mistake).
predictions_svc

In [None]:
# scikit-learn expects (y_true, y_pred); passing them the other way round swaps
# the precision and recall columns and reports support for the predicted labels.
print(classification_report(y_test,predictions_svc))

In [None]:
# Confusion matrix for the SVM: rows are the true categories, columns the
# predicted ones. This cell used to plot the random forest predictions
# (predictions_rfc) under the SVM heading, with the arguments in swapped order.
svc_labels=svc.classes_
plt.figure(figsize=(10,6))
sns.heatmap(confusion_matrix(y_test,predictions_svc,labels=svc_labels),cmap='YlOrBr',annot=True,fmt='d',
            xticklabels=svc_labels,yticklabels=svc_labels)   #fmt='d' prints counts as plain integers
plt.xlabel('Predicted category')
plt.ylabel('True category');

In [None]:
# Fraction of test samples classified correctly by the SVM.
svc_accuracy=accuracy_score(y_test,predictions_svc)
print(svc_accuracy)

#### It can be seen that Naive Bayes classifier gives maximum accuracy (approx. 88%)