In [3]:
import numpy as np
import pandas as pd
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Read the raw tweet dump. header=None tells pandas the first line is data,
# not column names, so the columns come back labelled 0..5 and are named in
# the next cell. The file is decoded as latin1.
df = pd.read_csv(
    'Twitter.csv',
    header=None,
    encoding='latin1',
)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Give the six positional columns readable names. Assignment is positional:
# the first name replaces column 0, the last replaces column 5.
new_column_names = [
    'Category',
    'ID',
    'Date',
    'Query',
    'Username',
    'Text',
]
df.columns = new_column_names
df.head()

Unnamed: 0,Category,ID,Date,Query,Username,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Keep only 'Category' and 'Text'; the remaining metadata columns are not
# used by the model.
# Columns are dropped by name rather than by position so the cell does not
# depend on column order, and errors='ignore' makes it safe to re-run: once
# the columns are gone a second execution is a no-op instead of an IndexError.
columns_to_drop = ['ID', 'Date', 'Query', 'Username']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [7]:
# Partition the data into train / validation / test (70% / 15% / 15%).

X, y = df['Text'], df['Category']

# Step 1: hold out 30% of the rows; the remaining 70% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 2: cut the held-out 30% in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [12]:
# Sanity check: report the size of every split (features and labels).
split_shapes = (
    ("X_train shape:", X_train, "Y_train shape:", y_train),
    ("X_test shape:", X_test, "Y_test shape:", y_test),
    ("X_val shape:", X_val, "Y_val shape:", y_val),
)
for x_label, x_part, y_label, y_part in split_shapes:
    print(x_label, x_part.shape, y_label, y_part.shape)

X_train shape: (1120000,) Y_train shape: (1120000,)
X_test shape: (240000,) Y_test shape: (240000,)
X_val shape: (240000,) Y_val shape: (240000,)


In [13]:
# Persist each split as a two-column CSV (Text, Category).

train_df = pd.DataFrame({'Text': X_train, 'Category': y_train})
val_df = pd.DataFrame({'Text': X_val, 'Category': y_val})
test_df = pd.DataFrame({'Text': X_test, 'Category': y_test})

# index=False keeps the pandas row index out of the written files.
split_files = (
    ('train.csv', train_df),
    ('validation.csv', val_df),
    ('test.csv', test_df),
)
for csv_name, split_df in split_files:
    split_df.to_csv(csv_name, index=False)

In [14]:
# TF-IDF vectorizer, vocabulary limited to at most 1000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and IDF weights from the training text only, so the
# validation and test splits cannot influence the features.
# fit_transform does the fit and the transform in a single pass; the previous
# fit() followed by transform() tokenized the whole training corpus twice.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['Text'])

# Project the held-out splits with the already-fitted vectorizer.
X_val_tfidf = tfidf_vectorizer.transform(val_df['Text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['Text'])

In [15]:
# Hyperparameter tuning: exhaustive search over max_depth x min_samples_split
# (3 x 3 = 9 candidates), scored by accuracy on the validation split.

max_depth_values = [10, 15, 20]
min_samples_split_values = [5, 10, 15]

# Start below any achievable accuracy so the first candidate is always
# recorded; with a start value of 0 and a strict '>' comparison, best_params
# would stay empty if every candidate scored exactly 0.
best_accuracy = float("-inf")
best_params = {}

for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        # Fixed random_state makes each candidate tree reproducible, so the
        # search selects the same hyperparameters on every run. The seed
        # matches the one used for train_test_split.
        clf = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42,
        )
        clf.fit(X_train_tfidf, y_train)

        # Predicting on validation set
        y_pred_val = clf.predict(X_val_tfidf)

        # Calculating accuracy
        accuracy = accuracy_score(y_val, y_pred_val)

        # Strict '>' keeps the earliest candidate when accuracies tie.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
            }

print("Best hyperparameters:", best_params)
print("Best accuracy on validation set:", best_accuracy)

Best hyperparameters: {'max_depth': 20, 'min_samples_split': 5}
Best accuracy on validation set: 0.663925


In [16]:
# Refit a decision tree on the training set using the hyperparameters that
# scored best on the validation split.

best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']

# random_state is fixed so the final model is reproducible across runs.
# The seed matches the one used for train_test_split.
best_clf = DecisionTreeClassifier(
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    random_state=42,
)

best_clf.fit(X_train_tfidf, y_train)

In [17]:
# Final evaluation: score the tuned tree on the held-out test split.

# Class predictions for every test tweet.
y_pred_test = best_clf.predict(X_test_tfidf)

# Fraction of test tweets whose predicted label equals the true label.
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print("Accuracy on test set using the best hyperparameters:", accuracy_test)

Accuracy on test set using the best hyperparameters: 0.6642291666666666
