In [1]:
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


In [89]:
# Shared spaCy English pipeline; preprocess_text() below reads this module-level `nlp`.
nlp = spacy.load(name="en_core_web_sm")

def preprocess_text(text):
    """Lemmatise ``text`` with the shared ``nlp`` pipeline and return the kept lemmas.

    A token is kept only when its lemma is purely alphanumeric and the token
    is not flagged as punctuation, whitespace, or a stop word. Kept lemmas are
    lower-cased and joined with single spaces.

    Args:
        text: Raw text to clean. ``None`` and float ``NaN`` (the usual
            missing-value markers when this is applied to a pandas column)
            are treated as empty text instead of being passed to ``nlp``.

    Returns:
        str: The space-joined, lower-cased lemmas; ``""`` when no token
        survives the filter or ``text`` is a missing value.
    """
    # NaN is the only float that is not equal to itself, so `text != text`
    # detects it without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    doc = nlp(text)
    tokens = [
        token.lemma_.lower()
        for token in doc
        if token.lemma_.isalnum()
        and not (token.is_punct or token.is_space or token.is_stop)
    ]
    return " ".join(tokens)

# Load the dataset; later cells read its 'tweet' and 'class' columns.
data = pd.read_csv(filepath_or_buffer="test.csv")

In [90]:
# Clean every tweet with preprocess_text (element-wise over the 'tweet' column).
p_data = data['tweet'].map(preprocess_text)

In [91]:
# Keep the cleaned text next to the raw tweets as a 'p_data' column.
data.loc[:, 'p_data'] = p_data

In [93]:
# Model inputs are the cleaned tweets; targets are the values of the 'class' column.
x, y = data['p_data'], data['class']

In [94]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [95]:
# Limit the vocabulary to 5000 terms; max_features is a tunable knob.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training split only...
x_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=x_train)

# ...and reuse that fitted state to encode the held-out split.
x_test_tfidf = tfidf_vectorizer.transform(raw_documents=x_test)


In [96]:
from sklearn.model_selection import GridSearchCV

# Tune LinearSVC's regularisation strength C with 5-fold cross-validation
# on the training features, then keep the best refit estimator.
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    estimator=LinearSVC(),
    param_grid=param_grid,
    cv=5,
).fit(x_train_tfidf, y_train)
best_classifier = grid_search.best_estimator_




In [34]:
# Baseline: a LinearSVC with library-default hyperparameters,
# trained on the TF-IDF features of the training split.
classifier = LinearSVC()
classifier.fit(X=x_train_tfidf, y=y_train)




In [97]:
# Score the held-out split with the tuned model.
y_pred = best_classifier.predict(X=x_test_tfidf)

# Per-class precision / recall / F1 against the true labels.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.16      0.24       293
           1       0.91      0.96      0.93      3834
           2       0.85      0.81      0.83       840

    accuracy                           0.89      4967
   macro avg       0.76      0.65      0.67      4967
weighted avg       0.87      0.89      0.88      4967


In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Two alternative models to compare against the LinearSVC results.
# A random forest is stochastic, so pin its seed (same value as the
# train/test split) to make its classification report repeatable across runs.
rf_classifier = RandomForestClassifier(random_state=42)
nb_classifier = MultinomialNB()


In [99]:
# Train the random forest, then predict the held-out split (fit returns the estimator).
y_pred = rf_classifier.fit(x_train_tfidf, y_train).predict(x_test_tfidf)

# Per-class precision / recall / F1 for the random forest.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.15      0.23       293
           1       0.92      0.95      0.94      3834
           2       0.81      0.88      0.85       840

    accuracy                           0.89      4967
   macro avg       0.73      0.66      0.67      4967
weighted avg       0.88      0.89      0.88      4967


In [100]:
# Train multinomial naive Bayes, then predict the held-out split (fit returns the estimator).
y_pred = nb_classifier.fit(x_train_tfidf, y_train).predict(x_test_tfidf)

# Per-class precision / recall / F1 for naive Bayes.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.01       293
           1       0.83      0.99      0.90      3834
           2       0.92      0.41      0.57       840

    accuracy                           0.84      4967
   macro avg       0.92      0.47      0.49      4967
weighted avg       0.85      0.84      0.79      4967


In [102]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Derive the label set from the training targets instead of hard-coding [0, 1, 2].
# compute_class_weight documents `classes` as an ndarray; np.unique returns one (sorted).
classes = np.unique(y_train)

# 'balanced' gives rarer classes proportionally larger weights.
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Map each label to its weight, the form LinearSVC's `class_weight` parameter accepts.
class_weight_dict = dict(zip(classes.tolist(), class_weights))

# Class-weighted LinearSVC; it is fitted in a later cell.
classifier = LinearSVC(class_weight=class_weight_dict)


In [44]:
# Train the current `classifier` on the TF-IDF training features.
classifier.fit(X=x_train_tfidf, y=y_train)



In [45]:
# Predict the held-out split with the fitted `classifier`.
y_pred = classifier.predict(X=x_test_tfidf)

# Per-class precision / recall / F1 against the true labels.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.34      0.39      0.36       290
           1       0.94      0.91      0.92      3832
           2       0.80      0.85      0.82       835

    accuracy                           0.87      4957
   macro avg       0.69      0.72      0.70      4957
weighted avg       0.88      0.87      0.87      4957


In [103]:
# Repeat the 5-fold search over C, this time with the class-weighted LinearSVC,
# and keep the best refit estimator.
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    estimator=LinearSVC(class_weight=class_weight_dict),
    param_grid=param_grid,
    cv=5,
).fit(x_train_tfidf, y_train)
best_classifier = grid_search.best_estimator_



In [104]:
# Score the held-out split with the current `best_classifier`.
y_pred = best_classifier.predict(X=x_test_tfidf)

# Per-class precision / recall / F1 against the true labels.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.49      0.46       293
           1       0.96      0.91      0.93      3834
           2       0.79      0.93      0.86       840

    accuracy                           0.89      4967
   macro avg       0.73      0.78      0.75      4967
weighted avg       0.90      0.89      0.89      4967


In [114]:
text = 'i love you darling'

# The vectorizer was fitted on text cleaned by preprocess_text (lemmatised,
# lower-cased, stop words removed), so new text must be cleaned the same way
# before it is encoded; otherwise its tokens may not match the learned vocabulary.
test = tfidf_vectorizer.transform([preprocess_text(text)])

In [115]:
# Classify the single encoded sample with the tuned model.
y_pred = best_classifier.predict(X=test)

In [116]:
# Display the predicted class label(s) for the sample text.
y_pred

array([2])