In [None]:
!pip3 install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=071a958d30925efde5455c489e45baa8ab5eee7d2e4d828427462f08cc8b6925
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


Loading the Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import nltk
nltk.download("punkt")
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read the train / validation / test splits from their CSV files.
train_df, val_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the validation and test labels.
label_encoder = LabelEncoder()
train_df['Category'] = label_encoder.fit_transform(train_df['Category'])
for split_df in (val_df, test_df):
    split_df['Category'] = label_encoder.transform(split_df['Category'])

In [None]:
#Preprocessing

# Resources shared by every preprocess_text call. They are built lazily on first
# use so the stopword list is loaded once instead of once per token.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build (on first call) and return the shared lemmatizer, stopword set and punctuation table."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
        # A frozenset gives constant-time membership tests; the original scanned a list.
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        # Translation table that deletes every ASCII punctuation character.
        _PREPROCESS_RESOURCES["punctuation_table"] = str.maketrans('', '', string.punctuation)
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalise one document into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, delete ASCII punctuation, tokenize with
    ``nltk.word_tokenize``, drop English stopwords, lemmatize each remaining
    token with ``WordNetLemmatizer``.

    Args:
        text: The raw document; must be a ``str`` (``.lower()`` is called on it).

    Returns:
        The processed tokens joined by single spaces ("" if nothing survives).

    Note:
        Punctuation is removed *before* stopword filtering, so a contraction such
        as "don't" becomes "dont" and is compared against the stopword list in
        that form. This ordering is kept so results match the original pipeline.
    """
    resources = _get_preprocess_resources()
    lemmatizer = resources["lemmatizer"]
    stop_words = resources["stop_words"]

    no_punctuation = text.lower().translate(resources["punctuation_table"])
    tokens = nltk.word_tokenize(no_punctuation)

    # Second condition mirrors the original's extra pass that drops any token
    # that is itself found in string.punctuation.
    kept_tokens = [
        token for token in tokens
        if token not in stop_words and token not in string.punctuation
    ]

    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [None]:
# Replace the raw training text with its preprocessed form, then display the frame.
train_df["Text"] = train_df["Text"].map(preprocess_text)
train_df

Unnamed: 0,Text,Category
0,disnleyland isnt tha happiest place tha world ...,1
1,omg bellllyyy shoutin sum food brb soooo starv...,0
2,im still waiting find caca stand hoping think ...,0
3,so face,1
4,sad see drive away wat,0
...,...,...
1119995,song middle change doesnt want born arghhhh,0
1119996,officialnjonas good luck,1
1119997,proudgamertweet rather average 32370,0
1119998,pickin misstinayao waitin sadittysash 2 hurry ...,0


In [None]:
# Replace the raw validation text with its preprocessed form, then display the frame.
val_df["Text"] = val_df["Text"].map(preprocess_text)
val_df

Unnamed: 0,Text,Category
0,would like something friend birthday,0
1,doggone got hour nap really want ponderosa chi...,0
2,bed good night twit,1
3,think im sick,0
4,ijustine follow justine,1
...,...,...
239995,rooftop party ln brooklyn hey least ln brookly...,1
239996,time school work week start okay think positiv...,0
239997,aureliustjin haha yeah name unique unless some...,1
239998,need money ticket,0


In [None]:
# Replace the raw test text with its preprocessed form, then display the frame.
test_df["Text"] = test_df["Text"].map(preprocess_text)
test_df

Unnamed: 0,Text,Category
0,dont want go work mood today,0
1,must stupid report childrens health,0
2,stuck work want sleep,0
3,djdimepiece hell id wifey hasnt left yet,0
4,kevinthompson know totally voice would great r...,1
...,...,...
239995,glennia show fun comment,1
239996,itskotepeople la cagã³ wn siento mi dedos,0
239997,wish move next year,0
239998,martagal language used undefined,1


In [None]:
# Loading GloVe embeddings
embedding_dim = 25


def load_glove_embeddings(path, dim):
    """Read a GloVe text file into a ``{token: float32 vector}`` dictionary.

    Each line is expected to be ``<token> <v1> ... <v_dim>``. The line is split
    from the right so that exactly the last ``dim`` fields are treated as the
    vector; whatever precedes them is the token. Lines that do not yield a token
    plus ``dim`` values (for example when the token is empty or consists only of
    whitespace-like characters) are skipped, rather than being stored as a
    shorter vector under a numeric-string key.

    Args:
        path: Path of the GloVe ``.txt`` file (read as UTF-8).
        dim: Number of vector components per line.

    Returns:
        dict mapping each token to a ``numpy`` array of dtype float32 and length ``dim``.
    """
    index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().rsplit(None, dim)
            if len(parts) != dim + 1:
                # Malformed line: no usable token or wrong number of values.
                continue
            index[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return index


embeddings_index = load_glove_embeddings('glove.twitter.27B.25d.txt', embedding_dim)

In [None]:
# Creating embeddings for train, validation, and test data
def create_embeddings(text_data):
    """Turn each document into one vector by summing the embeddings of its words.

    Words are obtained with ``str.split()``. Words missing from
    ``embeddings_index`` contribute nothing, so a document with no known words
    (or no words at all) maps to the all-zero vector of length ``embedding_dim``.

    Args:
        text_data: Iterable of strings.

    Returns:
        ``np.array`` built from the per-document sum vectors, in input order.
    """
    document_vectors = []
    for document in text_data:
        total = np.zeros(embedding_dim)
        for token in document.split():
            vector = embeddings_index.get(token)
            if vector is not None:
                total += vector
        document_vectors.append(total)
    return np.array(document_vectors)

# Build the feature matrices for the three splits, in train / val / test order.
X_train_embeddings, X_val_embeddings, X_test_embeddings = (
    create_embeddings(split_frame['Text'])
    for split_frame in (train_df, val_df, test_df)
)

In [None]:
# Base estimator handed to the search; random_state is fixed at 42.
rf_classifier = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values explored by the search.
param_grid = {
    'n_estimators': [10, 50, 100],  # number of trees in the forest
    'max_depth': [None, 5, 10],     # maximum depth of each tree (None = unlimited)
}

# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation on the training embeddings.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_embeddings, train_df['Category'])

# Report the winning combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Persist the best estimator found by the search.
joblib.dump(grid_search.best_estimator_, 'best_rf_model.pkl')
print("Best model saved as 'best_rf_model.pkl'")


Best Hyperparameters: {'max_depth': None, 'n_estimators': 100}
Best model saved as 'best_rf_model.pkl'


In [None]:
# Model with the best hyperparameters.
#
# The original cell built RandomForestClassifier() with default arguments, which
# ignored the search result and dropped the random_state, so the reported score
# was not reproducible. GridSearchCV (refit=True by default) has already refit
# the best configuration on the full training set, so reuse that estimator --
# the same object saved to 'best_rf_model.pkl' -- instead of training again.
rf_classifier = grid_search.best_estimator_

In [None]:
# Predict labels for the test embeddings and report the fraction predicted correctly.
y_test_pred_rf = rf_classifier.predict(X_test_embeddings)
test_accuracy = accuracy_score(y_true=test_df['Category'], y_pred=y_test_pred_rf)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.6950916666666667
