In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk import TweetTokenizer
from nltk import PorterStemmer

import re 
import pickle
from prettytable import PrettyTable

import scipy.sparse as sp

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [2]:
#Removing unnamed columns which are not necessary
# sample_data=pd.read_csv('final_dataset.csv',encoding='latin1')
# sample_data.drop(sample_data.columns[sample_data.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
# sample_data.head(15)
# sample_data.to_csv("final_datasetv2.csv")

# Loading the Dataset

In [3]:
# Load the cleaned dataset and discard any leftover "Unnamed: N" index columns.
# NOTE(review): the mojibake visible in the preview suggests 'latin1' may not be
# the encoding the file was written with — confirm against how the CSV was produced.
sample_data = pd.read_csv('final_datasetv2.csv', encoding='latin1')
unnamed_columns = sample_data.columns[sample_data.columns.str.contains('unnamed', case=False)]
sample_data = sample_data.drop(columns=unnamed_columns)
sample_data.head(15)

Unnamed: 0,url,description,verified,tweets,location,label
0,https://twitter.com/pokimanelol,content creator Ã¢ÂÂºÃ¯Â¸Â instagram.com/pok...,True,@Nihaachu the absolute cutiest patootiest Ã°Â...,cali Ã¢ÂÂÃ¯Â¸Â,Content Creator
1,https://twitter.com/neekolul,Partnered Spanglish @Twitch Streamer Ã°ÂÂÂ²Ã...,True,@sanbenito Benito en tejas todavÃÂ­a ahÃÂ­ s...,33333@brodinplett Sir u worked on the set of e...,Content Creator
2,https://twitter.com/ValorLeaks,Valorant Content Creator & Influencer | Valora...,False,Here's mine. https://t.co/CI9DCGH8fkWhat did y...,in a pitt,Content Creator
3,https://twitter.com/SuperSaf,Ã¢ÂÂ¶Ã¯Â¸Â Content Creator #Tech | #Travel |...,True,Looking forward to the #NextAtAcer Global Pres...,"Leicester, UK",Content Creator
4,https://twitter.com/NyanNyanners,horrible creature\n| mamas: @muryou_tada @Nia_...,True,@ironmouse @BubiVT Honestly true lmfao@_cherry...,United States,Content Creator
5,https://twitter.com/FurqanShayk,,True,"@AribaShahid @CareemPAK Yo safety first, even ...",copenhagen / islamabad,Content Creator
6,https://twitter.com/RiotCreatorSupp,North American Influencer Team @RiotGames supp...,True,. #RiotGrandPrix is back. Right now - see who ...,,Content Creator
7,https://twitter.com/HappyPower,"Content Creator for @MisfitsGG, News Reporter ...",True,OG Leaking Flash Back https://t.co/ujDj47AHsn@...,Use Code ALTMARZ In the Item Shop!,Content Creator
8,https://twitter.com/elgato,Stream. Record. Create. | Empowering content c...,True,@ItzNefarious Yes! \n\nÃ°ÂÂÂ https://t.co/u...,"Munich, Germany",Content Creator
9,https://twitter.com/Vikkstar123,Content Creator Ã¢ÂÂ¢ Business Contact: vikba...,True,@joinsideplus yeetEL SIDEMEN Ã°ÂÂ«Â¡ https://...,London,Content Creator


# Overview of the dataset

In [4]:
# Number of non-null values in each column (missing entries are not counted)
sample_data.count()

url            5962
description    5945
verified       5962
tweets         5957
location       4896
label          5962
dtype: int64

In [5]:
# Keep only the text fields and the target; the remaining columns are not used as features
sample_data = sample_data.drop(columns=['url', 'verified', 'location'])
sample_data.head()

Unnamed: 0,description,tweets,label
0,content creator Ã¢ÂÂºÃ¯Â¸Â instagram.com/pok...,@Nihaachu the absolute cutiest patootiest Ã°Â...,Content Creator
1,Partnered Spanglish @Twitch Streamer Ã°ÂÂÂ²Ã...,@sanbenito Benito en tejas todavÃÂ­a ahÃÂ­ s...,Content Creator
2,Valorant Content Creator & Influencer | Valora...,Here's mine. https://t.co/CI9DCGH8fkWhat did y...,Content Creator
3,Ã¢ÂÂ¶Ã¯Â¸Â Content Creator #Tech | #Travel |...,Looking forward to the #NextAtAcer Global Pres...,Content Creator
4,horrible creature\n| mamas: @muryou_tada @Nia_...,@ironmouse @BubiVT Honestly true lmfao@_cherry...,Content Creator


In [6]:
# Missing values per column
sample_data.isnull().sum()

description    17
tweets          5
label           0
dtype: int64

In [7]:
# Remove every row that has a missing value, then renumber the index from 0
sample_data = sample_data.dropna(axis=0).reset_index(drop=True)

In [8]:
# Number of rows per class label (most frequent first)
sample_data.loc[:, 'label'].value_counts()

Sports             1027
Actor               989
Politician          985
Content Creator     980
Singer              980
Education           979
Name: label, dtype: int64

In [9]:
# Reference frame listing every class name the label encoder must know about
label = pd.DataFrame(
    ["Content Creator", "Education", "Actor", "Politician", "Singer", "Sports"],
    columns=["label"],
)

In [10]:
# Keep an independent copy of the dataset so the data can be manipulated
# without interrupting the original
sample_data_output_encoded = sample_data.copy(deep=True)

In [11]:
# Fit the encoder on the flat array of known class names
label_encoder = LabelEncoder()
label_encoder.fit(label["label"].to_numpy())

LabelEncoder()

In [12]:
# Add the integer-encoded target next to the textual label and show both
print("\n\nOutput Attribute After Label Encoding:",
      "========================================\n",
      sep="\n")
sample_data["Encoded_label"] = label_encoder.transform(sample_data['label'])
print(sample_data[["label", "Encoded_label"]])



Output Attribute After Label Encoding:

                label  Encoded_label
0     Content Creator              1
1     Content Creator              1
2     Content Creator              1
3     Content Creator              1
4     Content Creator              1
...               ...            ...
5935           Sports              5
5936           Sports              5
5937           Sports              5
5938           Sports              5
5939           Sports              5

[5940 rows x 2 columns]


In [13]:
columns = ['description', 'tweets']

# Regexes are raw strings so that \w and \s reach the regex engine untouched
# (the non-raw spellings are invalid string escapes and trigger warnings).
HANDLE_PATTERN = r"@[\w]*"            # twitter handles such as @user_name
NON_LETTER_PATTERN = r"[^a-zA-Z#\s]"  # everything except letters, '#' and whitespace

# Built once and shared by both columns instead of being re-created per iteration
Tokenizer = TweetTokenizer()
ps = PorterStemmer()


def remove_pattern(column_data, pattern):
    """Return ``column_data`` with every match of the regex ``pattern`` removed."""
    return re.sub(pattern, "", column_data)


def preprocess_text(text):
    """Clean a single description/tweet string and return it as one string.

    Steps, in order:
      1. remove twitter handles,
      2. remove every character that is not a letter, '#' or whitespace,
      3. drop words of three characters or fewer,
      4. tokenize with ``TweetTokenizer``,
      5. stem each token with ``PorterStemmer``,
      6. stitch the stemmed tokens back together with single spaces.

    Both removals go through ``re.sub``, so the patterns are always treated as
    regular expressions regardless of the installed pandas version
    (``Series.str.replace`` only does so when ``regex=True`` is passed).
    """
    text = remove_pattern(text, HANDLE_PATTERN)
    text = remove_pattern(text, NON_LETTER_PATTERN)
    text = ' '.join(word for word in text.split() if len(word) > 3)
    return ' '.join(ps.stem(token) for token in Tokenizer.tokenize(text))


for column in columns:
    # Assign the whole column at once: the former row-by-row
    # ``df[col][i] = ...`` chained assignment raises SettingWithCopyWarning
    # and does not write through when pandas copy-on-write is enabled.
    sample_data["Processed "+column] = sample_data[column].apply(preprocess_text)
sample_data.head(10)

Unnamed: 0,description,tweets,label,Encoded_label,Processed description,Processed tweets
0,content creator Ã¢ÂÂºÃ¯Â¸Â instagram.com/pok...,@Nihaachu the absolute cutiest patootiest Ã°Â...,Content Creator,1,content creator instagramcompokimanelol contac...,absolut cutiest patootiest stream beta year we...
1,Partnered Spanglish @Twitch Streamer Ã°ÂÂÂ²Ã...,@sanbenito Benito en tejas todavÃÂ­a ahÃÂ­ s...,Content Creator,1,partner spanglish streamer anim food videogam ...,benito teja todava hello qweenlt
2,Valorant Content Creator & Influencer | Valora...,Here's mine. https://t.co/CI9DCGH8fkWhat did y...,Content Creator,1,valor content creator influenc valor datamin i...,here mine httpstcocidcghfkwhat your night mark...
3,Ã¢ÂÂ¶Ã¯Â¸Â Content Creator #Tech | #Travel |...,Looking forward to the #NextAtAcer Global Pres...,Content Creator,1,content creator #tech #travel #meme podcast host,look forward #nextatac global press confer liv...
4,horrible creature\n| mamas: @muryou_tada @Nia_...,@ironmouse @BubiVT Honestly true lmfao@_cherry...,Content Creator,1,horribl creatur mama banner #nyanart content c...,honestli true lmfao thank much thi comm seriou...
5,North American Influencer Team @RiotGames supp...,. #RiotGrandPrix is back. Right now - see who ...,Content Creator,1,north american influenc team support content c...,#riotgrandprix back right win bracket master d...
6,"Content Creator for @MisfitsGG, News Reporter ...",OG Leaking Flash Back https://t.co/ujDj47AHsn@...,Content Creator,1,content creator new report content creator #st...,leak flash back httpstcoujdjahsn want fortnit ...
7,Stream. Record. Create. | Empowering content c...,@ItzNefarious Yes! \n\nÃ°ÂÂÂ https://t.co/u...,Content Creator,1,stream record creat empow content creator sinc...,httpstcouaejpsmye httpstcoklifmleei enhanc her...
8,Content Creator Ã¢ÂÂ¢ Business Contact: vikba...,@joinsideplus yeetEL SIDEMEN Ã°ÂÂ«Â¡ https://...,Content Creator,1,content creator busi contact vikbarncom instag...,yeetel sidemen httpstcoonshewhgleav like https...
9,The Largest Event In The US Dedicated To Love ...,New EXXXOTICA Blog! Breaking Into The Biz Ã¢Â...,Content Creator,1,largest event dedic love wildli illprepar onli...,exxxotica blog break into howto guid get into ...


In [14]:
# Feature frame: the two cleaned text columns
X = sample_data[['Processed description', 'Processed tweets']]
X

Unnamed: 0,Processed description,Processed tweets
0,content creator instagramcompokimanelol contac...,absolut cutiest patootiest stream beta year we...
1,partner spanglish streamer anim food videogam ...,benito teja todava hello qweenlt
2,valor content creator influenc valor datamin i...,here mine httpstcocidcghfkwhat your night mark...
3,content creator #tech #travel #meme podcast host,look forward #nextatac global press confer liv...
4,horribl creatur mama banner #nyanart content c...,honestli true lmfao thank much thi comm seriou...
...,...,...
5935,tenista tenni player facebookcomgustavokuerten...,tamo chegando vamooo httpstcomulpmkngjonova ca...
5936,tenni player instagram contacto juancom,httpstcodhpagsfr httpstcokedoekbi httpstcoegig...
5937,tenni player,httpstcolgiuhcaypwno palabra httpstcozkmexvcha...
5938,tenni player olymp gold medalist silver medali...,httpstcoutjywmse soon httpstcofbmtnvmaf httpst...


In [15]:
# Target vector: the integer-encoded class label
Y = sample_data['Encoded_label']
Y

0       1
1       1
2       1
3       1
4       1
       ..
5935    5
5936    5
5937    5
5938    5
5939    5
Name: Encoded_label, Length: 5940, dtype: int32

## First approach: build BOW and TF-IDF features separately for the two text columns, then stack the resulting feature matrices side by side

## Bag-of-word Features

In [16]:
# Both text columns get their own vectorizer, configured identically:
# ignore terms in more than 90% of documents or in fewer than 2 documents,
# keep at most 1000 terms and drop English stop words.
bow_settings = dict(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
description_vectorizer = CountVectorizer(**bow_settings)
tweets_vectorizer = CountVectorizer(**bow_settings)

In [17]:
# Learn one vocabulary per text column and turn each column into a sparse count matrix.
# NOTE(review): the vectorizers are fitted on the full dataset, and the
# train/validation split only happens afterwards (train_test_split on train_bow),
# so validation rows contribute to the vocabulary — consider splitting first.
description_vectors = description_vectorizer.fit_transform(X['Processed description'])
tweets_vectors = tweets_vectorizer.fit_transform(X['Processed tweets'])

In [18]:
# Place the description features and the tweet features side by side (CSR layout)
bow_blocks = [description_vectors, tweets_vectors]
combined_2 = sp.hstack(bow_blocks, format='csr')
combined_2

<5940x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 210170 stored elements in Compressed Sparse Row format>

In [19]:
# Dense preview of the stacked BOW matrix, for visual inspection only.
# .toarray() materialises every cell of the sparse matrix in memory.
pd.DataFrame(combined_2.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5935,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5937,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5938,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [20]:
# train_bow is a second name for combined_2 (plain assignment, no copy is made);
# .todense() is only evaluated so that the cell displays the matrix.
train_bow = combined_2
train_bow.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
# 70/30 train/validation split of the BOW features; fixed seed for repeatability
bow_split = train_test_split(
    train_bow,
    Y,
    test_size=0.3,
    random_state=2,
)
x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = bow_split

# Functions to save, load model and make predictions on test data

In [22]:
def save_model(classifier, model_name):
    """Pickle ``classifier`` to ``<model_name>_trained_model.pkl``.

    Parameters
    ----------
    classifier : object
        Any picklable object (in this notebook: a fitted estimator).
    model_name : str
        Prefix of the output file name; may include a directory part.
    """
    # The context manager flushes and closes the handle even if pickling
    # raises; previously the file object was never closed explicitly.
    with open(model_name+'_trained_model.pkl', 'wb') as model_file:
        pickle.dump(classifier, model_file)
def load_model(model_name):
    """Load and return the object stored in ``<model_name>_trained_model.pkl``.

    Parameters
    ----------
    model_name : str
        The same prefix that was passed to ``save_model``.

    Returns
    -------
    object
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code — only load files that were
    # written by this notebook or come from a trusted source.
    # The context manager closes the handle once the object is read; the
    # original left the file open.
    with open(model_name+'_trained_model.pkl', 'rb') as model_file:
        return pickle.load(model_file)
def predictions(model, x_valid, y_valid):
    """Predict on ``x_valid`` with ``model`` and print the evaluation table.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x_valid : array-like or sparse matrix
        Validation features, in whatever form ``model.predict`` accepts.
    y_valid : array-like
        True labels aligned row-for-row with ``x_valid``. Any 1-D container
        works (Series of any name, list, ndarray).

    Returns
    -------
    None
        The scores are printed by ``evaluation``.
    """
    model_predictions = model.predict(x_valid)
    # Hand the labels straight to the scorer. The earlier version wrapped them
    # in a DataFrame and read back a column named "Encoded_label", which raised
    # KeyError for any label container not carrying exactly that name.
    evaluation(y_valid, model_predictions)

# Evaluation Measures

In [23]:
def evaluation(ytest, pred):
    """Print accuracy, precision, recall and F1 for macro and weighted averaging.

    Parameters
    ----------
    ytest : array-like
        True labels.
    pred : array-like
        Predicted labels, aligned with ``ytest``.

    The table has one row per averaging mode. Accuracy is the same in both
    rows; precision is shown as a percentage with one decimal, the other
    scores are rounded to three decimals.
    """
    rounded_accuracy = round(accuracy_score(ytest, pred), 3)

    table = PrettyTable()
    table.field_names = ["Average_Type","Accuracy", "Precision", "Recall", "F1 score"]

    for row_title, mode in (("Macro", 'macro'), ("Weighted", 'weighted')):
        precision_text = "{:.1%}".format(precision_score(ytest, pred, average=mode))
        rounded_recall = round(recall_score(ytest, pred, average=mode), 3)
        rounded_f1 = round(f1_score(ytest, pred, average=mode), 3)
        table.add_row([row_title, rounded_accuracy, precision_text, rounded_recall, rounded_f1])

    print("\n\nEvaluation Scores:")
    print("==================\n")
    print(table)

# Naive Bayes Model Training with BOW Features

In [24]:
# Gaussian Naive Bayes on the BOW features; the sparse split is densified for it
nb_model = GaussianNB()
nb_model.fit(X=x_train_bow.toarray(), y=y_train_bow)

GaussianNB()

In [25]:
# Round-trip the fitted Naive Bayes model through disk, then score the reloaded copy
save_model(classifier=nb_model, model_name='nb')
model = load_model(model_name="nb")
predictions(model=model, x_valid=x_valid_bow.toarray(), y_valid=y_valid_bow)



Evaluation Scores:

+--------------+----------+-----------+--------+----------+
| Average_Type | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|    Macro     |  0.557   |   55.5%   | 0.553  |  0.544   |
|   Weighted   |  0.557   |   55.7%   | 0.557  |  0.547   |
+--------------+----------+-----------+--------+----------+


# SVM Model Training with BOW Features

In [26]:
# Support vector classifier (library defaults) on the BOW features
svc_model = svm.SVC()
svc_model.fit(X=x_train_bow, y=y_train_bow)

SVC()

In [27]:
# Persist the BOW SVM, reload it and score the reloaded copy on the validation split
save_model(classifier=svc_model, model_name='svc')
model = load_model(model_name="svc")
predictions(model=model, x_valid=x_valid_bow, y_valid=y_valid_bow)



Evaluation Scores:

+--------------+----------+-----------+--------+----------+
| Average_Type | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|    Macro     |  0.815   |   81.8%   | 0.814  |  0.814   |
|   Weighted   |  0.815   |   82.0%   | 0.815  |  0.816   |
+--------------+----------+-----------+--------+----------+


# Logistic Regression Training with BOW Features

In [28]:
# Logistic regression (library defaults) on the BOW features
lr_model = LogisticRegression()
lr_model.fit(X=x_train_bow, y=y_train_bow)

LogisticRegression()

In [29]:
# Persist the BOW logistic regression, reload it and score the reloaded copy
save_model(classifier=lr_model, model_name='lr')
model = load_model(model_name="lr")
predictions(model=model, x_valid=x_valid_bow, y_valid=y_valid_bow)



Evaluation Scores:

+--------------+----------+-----------+--------+----------+
| Average_Type | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|    Macro     |  0.858   |   85.8%   | 0.857  |  0.857   |
|   Weighted   |  0.858   |   86.0%   | 0.858  |  0.859   |
+--------------+----------+-----------+--------+----------+


# Random Forest Training with BOW Features

In [30]:
# Random forest (library defaults) on the BOW features; fit() hands back the estimator
RF_model = RandomForestClassifier().fit(x_train_bow, y_train_bow)
print(RF_model)

RandomForestClassifier()


In [31]:
# Persist the BOW random forest, reload it and score the reloaded copy
save_model(classifier=RF_model, model_name='RF')
model = load_model(model_name="RF")
predictions(model=model, x_valid=x_valid_bow, y_valid=y_valid_bow)



Evaluation Scores:

+--------------+----------+-----------+--------+----------+
| Average_Type | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|    Macro     |  0.882   |   88.1%   | 0.881  |  0.881   |
|   Weighted   |  0.882   |   88.2%   | 0.882  |  0.882   |
+--------------+----------+-----------+--------+----------+


## TF-IDF features (bag-of-words counts followed by TF-IDF weighting)

In [32]:
# One TF-IDF vectorizer per text column, configured identically:
# ignore terms in more than 90% of documents or in fewer than 2 documents,
# and drop English stop words (no cap on the vocabulary size here).
tfidf_settings = dict(max_df=0.90, min_df=2, stop_words='english')
description_tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)
tweets_tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

In [33]:
# Learn vocabulary and IDF weights per text column and build the sparse TF-IDF matrices.
# NOTE(review): fitting happens on the full dataset, before the later
# train_test_split on train_tfidf_matrix, so validation rows influence the
# vocabulary and IDF weights — consider splitting first.
description_tfidf_vectors = description_tfidf_vectorizer.fit_transform(X['Processed description'])
tweets_tfidf_vectors = tweets_tfidf_vectorizer.fit_transform(X['Processed tweets'])

In [34]:
# Place the description and tweet TF-IDF features side by side (CSR layout)
tfidf_blocks = [description_tfidf_vectors, tweets_tfidf_vectors]
combined_tfidf = sp.hstack(tfidf_blocks, format='csr')
combined_tfidf

<5940x20712 sparse matrix of type '<class 'numpy.float64'>'
	with 325191 stored elements in Compressed Sparse Row format>

In [35]:
# train_tfidf_matrix is a second name for combined_tfidf (plain assignment, no copy);
# .todense() is only evaluated so that the cell displays the matrix.
train_tfidf_matrix = combined_tfidf
train_tfidf_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
# 70/30 train/validation split of the TF-IDF features; fixed seed for repeatability
tfidf_split = train_test_split(
    train_tfidf_matrix,
    Y,
    test_size=0.3,
    random_state=2,
)
x_train_tfidf, x_valid_tfidf, y_train_tfidf, y_valid_tfidf = tfidf_split

In [47]:
# Show the validation split's summary (shape and number of stored elements).
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it — re-run the notebook top to bottom before relying on the outputs.
x_valid_tfidf

<1782x20712 sparse matrix of type '<class 'numpy.float64'>'
	with 98555 stored elements in Compressed Sparse Row format>

# SVM Model Training using CountVectorizer followed by TfidfTransformer
 - This is done with scikit-learn's `TfidfVectorizer`, which applies both steps (`CountVectorizer` followed by `TfidfTransformer`) in a single call.

In [40]:
# Support vector classifier (library defaults) on the TF-IDF features
svc_tfidf_model = svm.SVC()
svc_tfidf_model.fit(X=x_train_tfidf, y=y_train_tfidf)

SVC()

In [41]:
# Persist the TF-IDF SVM, reload it and score the reloaded copy
save_model(classifier=svc_tfidf_model, model_name='svc_tfidf')
model = load_model(model_name="svc_tfidf")
predictions(model=model, x_valid=x_valid_tfidf, y_valid=y_valid_tfidf)



Evaluation Scores:

+--------------+----------+-----------+--------+----------+
| Average_Type | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|    Macro     |  0.897   |   89.8%   | 0.897  |  0.897   |
|   Weighted   |  0.897   |   89.9%   | 0.897  |  0.898   |
+--------------+----------+-----------+--------+----------+


# Logistic Regression Model Training using CountVectorizer followed by TfidfTransformer

In [42]:
# Logistic regression (library defaults) on the TF-IDF features
lr_tfidf_model = LogisticRegression()
lr_tfidf_model.fit(X=x_train_tfidf, y=y_train_tfidf)

LogisticRegression()

In [43]:
# Persist the TF-IDF logistic regression, reload it and score the reloaded copy
save_model(classifier=lr_tfidf_model, model_name='lr_tfidf')
model = load_model(model_name="lr_tfidf")
predictions(model=model, x_valid=x_valid_tfidf, y_valid=y_valid_tfidf)



Evaluation Scores:

+--------------+----------+-----------+--------+----------+
| Average_Type | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|    Macro     |  0.899   |   89.8%   | 0.898  |  0.898   |
|   Weighted   |  0.899   |   89.9%   | 0.899  |  0.899   |
+--------------+----------+-----------+--------+----------+


# Random Forest Model Training using CountVectorizer followed by TfidfTransformer

In [44]:
RF_tfidf_model = RandomForestClassifier()
RF_tfidf_model = RF_model.fit(x_train_tfidf,y_train_tfidf)
print(RF_model)

RandomForestClassifier()


In [45]:
save_model(RF_model, 'RF_tfidf')
model = load_model("RF_tfidf")
predictions(model, x_valid_tfidf, y_valid_tfidf)



Evaluation Scores:

+--------------+----------+-----------+--------+----------+
| Average_Type | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|    Macro     |  0.868   |   86.6%   | 0.866  |  0.865   |
|   Weighted   |  0.868   |   86.7%   | 0.868  |  0.867   |
+--------------+----------+-----------+--------+----------+


# CNN Model using BOW Features

In [None]:
# from keras.models import Sequential
# from keras import layers

# input_dim = x_train_bow.shape[1]  # Number of features

# model = Sequential()
# model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))

# CNN link
- https://realpython.com/python-keras-text-classification/#choosing-a-data-set