In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [30]:
# Step 1: Data Understanding
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [31]:
train_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [32]:
train_data

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [33]:
test_data

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [34]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [35]:
# Step 2: Data Preprocessing
# Text cleaning
def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text


In [36]:
# Tokenization and removing stopwords
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)


In [37]:
# Apply text cleaning and preprocessing to 'tweet' column
train_data['cleaned_tweet'] = train_data['tweet'].apply(clean_text)
train_data['preprocessed_tweet'] = train_data['cleaned_tweet'].apply(preprocess_text)

In [38]:
# Display the updated train data
print(train_data.head())

   id  label                                              tweet  \
0   1      0   @user when a father is dysfunctional and is s...   
1   2      0  @user @user thanks for #lyft credit i can't us...   
2   3      0                                bihday your majesty   
3   4      0  #model   i love u take with u all the time in ...   
4   5      0             factsguide: society now    #motivation   

                                       cleaned_tweet  \
0   user when a father is dysfunctional and is so...   
1  user user thanks for lyft credit i cant use ca...   
2                                bihday your majesty   
3  model   i love u take with u all the time in u...   
4               factsguide society now    motivation   

                                  preprocessed_tweet  
0  user father dysfunctional selfish drags kids d...  
1  user user thanks lyft credit cant use cause do...  
2                                     bihday majesty  
3                        model love u ta

In [39]:
# Apply text cleaning and preprocessing to 'tweet' column
test_data['cleaned_tweet'] = train_data['tweet'].apply(clean_text)
test_data['preprocessed_tweet'] = train_data['cleaned_tweet'].apply(preprocess_text)

In [40]:
# Step 3: Feature Engineering
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features


In [41]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features


In [42]:
# Transform the training data
X_train = tfidf_vectorizer.fit_transform(train_data['preprocessed_tweet'])
y_train = train_data['label']


In [43]:
# Transform the test data
X_test = tfidf_vectorizer.transform(test_data['preprocessed_tweet'])


In [44]:
# Step 4: Model Selection and Training
from sklearn.linear_model import LogisticRegression


In [45]:
# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [46]:
# Step 5: Model Evaluation
from sklearn.metrics import accuracy_score, classification_report


In [47]:
# Predict on the training data
y_pred_train = model.predict(X_train)

In [48]:
# Evaluate the model
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Classification Report:")
print(classification_report(y_train, y_pred_train))

Training Accuracy: 0.957167886865653
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     29720
           1       0.95      0.41      0.57      2242

    accuracy                           0.96     31962
   macro avg       0.95      0.71      0.78     31962
weighted avg       0.96      0.96      0.95     31962



In [49]:
# Step 6: Predicting on Test Data
y_pred_test = model.predict(X_test)

In [50]:
# Step 6: Save Predictions
test_data['predicted_label'] = y_pred_test
test_data.to_csv('test_predictions.csv', index=False)

In [51]:
test_data

Unnamed: 0,id,tweet,cleaned_tweet,preprocessed_tweet,predicted_label
0,31963,#studiolife #aislife #requires #passion #dedic...,user when a father is dysfunctional and is so...,user father dysfunctional selfish drags kids d...,0
1,31964,@user #white #supremacists want everyone to s...,user user thanks for lyft credit i cant use ca...,user user thanks lyft credit cant use cause do...,0
2,31965,safe ways to heal your #acne!! #altwaystohe...,bihday your majesty,bihday majesty,0
3,31966,is the hp and the cursed child book up for res...,model i love u take with u all the time in u...,model love u take u time ur,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",factsguide society now motivation,factsguide society motivation,0
...,...,...,...,...,...
17192,49155,thought factory: left-right polarisation! #tru...,photobomb bull up you will dominate your bul...,photobomb bull dominate bull direct whatever w...,0
17193,49156,feeling like a mermaid ð #hairflip #neverre...,good news guys its friday almostthere wemad...,good news guys friday almostthere wemadeit cof...,0
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,fighting battles that have already been won in...,fighting battles already instead fighting mass...,0
17195,49158,"happy, at work conference: right mindset leads...",life is way too sho to waste it i say bring o...,life way sho waste say bring anything run swea...,0


In [52]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [55]:
# Define y_test
y_test = test_data['predicted_label']

In [56]:
# Step 7: Model Evaluation on Test Data
y_test_pred = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Classification Report on Test Data:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Test Accuracy: 1.0
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16652
           1       1.00      1.00      1.00       545

    accuracy                           1.00     17197
   macro avg       1.00      1.00      1.00     17197
weighted avg       1.00      1.00      1.00     17197

Confusion Matrix:
[[16652     0]
 [    0   545]]


In [57]:
# Step 8: Analysis of Results
# Example of correctly classified tweets
correctly_classified_indices = (y_test == y_test_pred)
correctly_classified_tweets = test_data[correctly_classified_indices]['preprocessed_tweet'].head(5).tolist()
print("\nExamples of Correctly Classified Tweets:")
for tweet in correctly_classified_tweets:
    print(tweet)


Examples of Correctly Classified Tweets:
user father dysfunctional selfish drags kids dysfunction run
user user thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked
bihday majesty
model love u take u time ur
factsguide society motivation


In [58]:
# Example of misclassified tweets
misclassified_indices = (y_test != y_test_pred)
misclassified_tweets = test_data[misclassified_indices]['preprocessed_tweet'].head(5).tolist()
print("\nExamples of Misclassified Tweets:")
for tweet in misclassified_tweets:
    print(tweet)


Examples of Misclassified Tweets:


In [59]:
test_data

Unnamed: 0,id,tweet,cleaned_tweet,preprocessed_tweet,predicted_label
0,31963,#studiolife #aislife #requires #passion #dedic...,user when a father is dysfunctional and is so...,user father dysfunctional selfish drags kids d...,0
1,31964,@user #white #supremacists want everyone to s...,user user thanks for lyft credit i cant use ca...,user user thanks lyft credit cant use cause do...,0
2,31965,safe ways to heal your #acne!! #altwaystohe...,bihday your majesty,bihday majesty,0
3,31966,is the hp and the cursed child book up for res...,model i love u take with u all the time in u...,model love u take u time ur,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",factsguide society now motivation,factsguide society motivation,0
...,...,...,...,...,...
17192,49155,thought factory: left-right polarisation! #tru...,photobomb bull up you will dominate your bul...,photobomb bull dominate bull direct whatever w...,0
17193,49156,feeling like a mermaid ð #hairflip #neverre...,good news guys its friday almostthere wemad...,good news guys friday almostthere wemadeit cof...,0
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,fighting battles that have already been won in...,fighting battles already instead fighting mass...,0
17195,49158,"happy, at work conference: right mindset leads...",life is way too sho to waste it i say bring o...,life way sho waste say bring anything run swea...,0


In [62]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [63]:

# Model Performance Metrics
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))



Test Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [64]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[16652     0]
 [    0   545]]


In [65]:
# Classification Report
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16652
           1       1.00      1.00      1.00       545

    accuracy                           1.00     17197
   macro avg       1.00      1.00      1.00     17197
weighted avg       1.00      1.00      1.00     17197

