In [23]:
import pandas as pd 
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df= pd.read_csv(r"C:\Users\Rohan\Pictures\rohan\NLP\text.csv")

In [3]:
df.shape

(416809, 3)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [5]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [24]:
def preprocess_text(text):
    text = re.sub(r'<[^>]*>|[&?]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum() or word in ['!', '?']]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

df['text'] = df['text'].apply(preprocess_text)

In [6]:
df.shape

(416809, 2)

In [7]:
df.label.value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [17]:
from imblearn.under_sampling import RandomUnderSampler

X = df.drop(columns=['label'])  
y = df['label']  
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [30]:
y_resampled.shape

(89832,)

In [27]:
X_resampled

Unnamed: 0,text
133243,ive learned surround woman lift leave feeling ...
88501,already feel crappy upset situation doesnt help
131379,feel like lost mourned moved past tear relatio...
148369,could write whole lot im feeling crappy dont t...
134438,always seem feel inadequate
...,...
416753,feel like stunned
416762,feel like muscle around eye something funny go...
416799,feel must confess even though kill say admit i...
416806,feel curious previous early dawn time seek tro...


In [25]:
X_resampled['text'] = X_resampled['text'].apply(preprocess_text)

In [31]:
X_resampled['text'].shape

(89832,)

without use resampling

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df.label, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer(max_features=43000)  # Adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_train_pred = model.predict(X_train_tfidf)
y_test_pred = model.predict(X_test_tfidf)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

print("Classification Report for Testing Set:")
print(classification_report(y_test, y_test_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy: 0.9184188191826587
Testing Accuracy: 0.9001223579088794
Classification Report for Testing Set:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     24201
           1       0.91      0.93      0.92     28164
           2       0.81      0.76      0.79      6929
           3       0.90      0.90      0.90     11441
           4       0.85      0.85      0.85      9594
           5       0.79      0.70      0.74      3033

    accuracy                           0.90     83362
   macro avg       0.87      0.85      0.86     83362
weighted avg       0.90      0.90      0.90     83362



with use resampling

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled['text'], y_resampled, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer(max_features=43000)  # Adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_train_pred = model.predict(X_train_tfidf)
y_test_pred = model.predict(X_test_tfidf)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

print("Classification Report for Testing Set:")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9396507340151673
Testing Accuracy: 0.9113931095898036
Classification Report for Testing Set:
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      2980
           1       0.92      0.87      0.90      2961
           2       0.91      0.96      0.93      3115
           3       0.92      0.93      0.92      2981
           4       0.91      0.85      0.88      3094
           5       0.88      0.96      0.92      2836

    accuracy                           0.91     17967
   macro avg       0.91      0.91      0.91     17967
weighted avg       0.91      0.91      0.91     17967



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
def predict_sentiment(user_input, tfidf_vectorizer, model):
    preprocessed_input = preprocess_text(user_input)
    input_tfidf = tfidf_vectorizer.transform([preprocessed_input])
    user_prediction = model.predict(input_tfidf)[0]
    return user_prediction

user_input = 'gave internship dmrg feeling distraught'
predicted_sentiment = predict_sentiment(user_input, tfidf_vectorizer, model)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: 4


In [42]:
df.text[2]

'gave internship dmrg feeling distraught'

In [43]:
df.label[2]

4

In [45]:
df.text[5]

'beginning feel quite disheartened'

In [46]:
df.label[5]

0

In [47]:
user_input ='beginning feel quite disheartened'
predicted_sentiment = predict_sentiment(user_input, tfidf_vectorizer, model)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: 0
