In [1]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
df = pd.read_csv('hamro.csv')

In [8]:
df

Unnamed: 0,Text,Class
0,"""Feeling lucky today?""",Gambling
1,"""Take a chance, you might win big!""",Gambling
2,"""Just one more bet won't hurt.""",Gambling
3,"""The thrill of the game is irresistible.""",Gambling
4,"""You used to be so good at this, why stop now?""",Gambling
...,...,...
5348,A comforting presence that brings peace.,Affection
5349,Cherishing moments that become cherished treas...,Affection
5350,Acts of devotion that define profound love.,Affection
5351,Supportive relationships that empower growth.,Affection


In [9]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if token.lemma_.isalnum() and not token.lemma_.isnumeric() and not token.is_punct and not token.is_space and not token.is_stop]
    return " ".join(tokens)

In [12]:
df['p_Text'] = df['Text'].apply(preprocess_text)

In [48]:
x = df['p_Text']
y = df['Class']
x


0                                 feel lucky today
1                                   chance win big
2                                    bet will hurt
3                         thrill game irresistible
4                                        good stop
                           ...                    
5348               comforting presence bring peace
5349             cherish moment cherished treasure
5350             act devotion define profound love
5351        supportive relationship empower growth
5352    heartfelt connection strengthen compassion
Name: p_Text, Length: 5353, dtype: object

In [47]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust the 

x_tfidf = tfidf_vectorizer.fit_transform(x)


In [18]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# Split the data
X_train, X_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

model = LinearSVC()

# # Fit the multi-output classifier
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = classifier.predict(X_test)



In [20]:
#HyperParameter 
from sklearn.model_selection import GridSearchCV

# Example: Grid search for LinearSVC hyperparameters

param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(LinearSVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_classifier = grid_search.best_estimator_




In [30]:
y_pred = best_classifier.predict(X_test)
X_test

<1071x8768 sparse matrix of type '<class 'numpy.float64'>'
	with 13745 stored elements in Compressed Sparse Row format>

In [22]:
report = classification_report(y_test, y_pred)

print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

   Affection       0.96      0.98      0.97       286
 Affirmation       0.92      0.90      0.91        61
  Depression       0.99      0.99      0.99       304
    Gambling       1.00      1.00      1.00        39
  HateSpeech       0.99      0.98      0.99       307
Mobile Games       1.00      1.00      1.00        17
   Political       1.00      0.98      0.99        57

    accuracy                           0.98      1071
   macro avg       0.98      0.98      0.98      1071
weighted avg       0.98      0.98      0.98      1071



In [23]:

from sklearn.metrics import confusion_matrix

In [24]:
confusion_matrix(y_pred,y_test)

array([[280,   6,   2,   0,   4,   0,   0],
       [  5,  55,   0,   0,   0,   0,   0],
       [  1,   0, 301,   0,   2,   0,   0],
       [  0,   0,   0,  39,   0,   0,   0],
       [  0,   0,   1,   0, 301,   0,   1],
       [  0,   0,   0,   0,   0,  17,   0],
       [  0,   0,   0,   0,   0,   0,  56]], dtype=int64)

In [61]:
text1 = "I love you"
text2 = "I hate you"

In [62]:
test = tfidf_vectorizer.fit_transform([text1, text2])
print(test)

  (0, 2)	0.5797386715376657
  (0, 1)	0.8148024746671689
  (1, 0)	0.8148024746671689
  (1, 2)	0.5797386715376657


In [64]:
best_classifier.predict(test)

ValueError: X has 3 features, but LinearSVC is expecting 8768 features as input.

In [50]:
X_train[0]

<1x8768 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [52]:
test[0]

<1x2 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [53]:
X_test[0]

<1x8768 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>