In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack, csr_matrix

In [3]:
# Preprocessing function for tweets
def preprocess_tweet(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^A-Za-z\s]", "", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
# Load the datasets
file_path = 'training-Obama-Romney-tweets.xlsx'
obama_data = pd.read_excel(file_path, sheet_name='Obama')


# Data Cleaning
obama_data_cleaned = obama_data[['Anootated tweet', 'Unnamed: 4']].rename(
    columns={'Anootated tweet': 'tweet', 'Unnamed: 4': 'sentiment'}
)
valid_sentiments = ['1', '-1', '0']
obama_data_cleaned = obama_data_cleaned[obama_data_cleaned['sentiment'].isin(valid_sentiments)]
obama_data_cleaned['sentiment'] = obama_data_cleaned['sentiment'].astype(int)
obama_data_cleaned['tweet'] = obama_data_cleaned['tweet'].fillna("").astype(str).apply(preprocess_tweet)

# Add metadata features
obama_data_cleaned['tweet_length'] = obama_data_cleaned['tweet'].apply(len)
obama_data_cleaned['num_hashtags'] = obama_data_cleaned['tweet'].apply(lambda x: x.count('#'))
obama_data_cleaned['num_mentions'] = obama_data_cleaned['tweet'].apply(lambda x: x.count('@'))

# TF-IDF Vectorization with N-grams
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # Unigrams and bigrams
    stop_words='english'  # Remove stopwords
)
tfidf_features = tfidf_vectorizer.fit_transform(obama_data_cleaned['tweet'])

# Convert metadata features to sparse matrix
metadata_features = obama_data_cleaned[['tweet_length', 'num_hashtags', 'num_mentions']].values
metadata_features_sparse = csr_matrix(metadata_features)

# Combine TF-IDF and metadata features
X_combined = hstack([tfidf_features, metadata_features_sparse])

# Target variable
y = obama_data_cleaned['sentiment']

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, shuffle=True)

# SVM with GridSearchCV
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'class_weight': ['balanced'],
    'gamma': ['scale', 'auto']
}
grid_search_svm = GridSearchCV(
    SVC(random_state=42),
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=3,
    verbose=2
)
grid_search_svm.fit(X_train, y_train)

# Evaluate the SVM model
y_pred_svm = grid_search_svm.best_estimator_.predict(X_test)
print("SVM Model Performance:")
print(classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END C=0.1, class_weight=balanced, gamma=scale, kernel=linear; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=scale, kernel=linear; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=scale, kernel=linear; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=auto, kernel=linear; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=auto, kernel=linear; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=auto, kernel=linear; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END C=0.

In [5]:
# Save TF-IDF and metadata vectorizer for future inference
import joblib
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(grid_search_svm.best_estimator_, 'svm_model.pkl')


#sample_file_path = 'sample-testdata.xlsx'
sample_data = pd.read_excel('final-testData-no-label-Obama-tweets.xlsx')

sample_data.columns = ['Index', 'tweet']

# Preprocess the sample dataset
sample_data_cleaned = sample_data[['tweet']]
sample_data_cleaned['tweet'] = sample_data_cleaned['tweet'].fillna("").astype(str).apply(preprocess_tweet)

# Transform the sample dataset using the trained TF-IDF vectorizer
sample_features = tfidf_vectorizer.transform(sample_data_cleaned['tweet'])

# Add metadata for the sample dataset
sample_data_cleaned['tweet_length'] = sample_data_cleaned['tweet'].apply(len)
sample_data_cleaned['num_hashtags'] = sample_data_cleaned['tweet'].apply(lambda x: x.count('#'))
sample_data_cleaned['num_mentions'] = sample_data_cleaned['tweet'].apply(lambda x: x.count('@'))
sample_metadata = sample_data_cleaned[['tweet_length', 'num_hashtags', 'num_mentions']].values
sample_metadata_sparse = csr_matrix(sample_metadata)

# Combine features
sample_combined = hstack([sample_features, sample_metadata_sparse])

# Predict sentiment for the sample dataset
sample_predictions = grid_search_svm.best_estimator_.predict(sample_combined)

# Save predictions to a .txt file
output_txt_path = 'sanjna-asritha-obama.txt'
with open(output_txt_path, 'w') as f:
    f.write("(setf x '(\n")  # Write the opening line
    for i, prediction in enumerate(sample_predictions, start=1):
        f.write(f"({i} {prediction})\n")  # Write each prediction as (tweet_number predict_label)
    f.write("\n) )\n")  # Write the closing line

print(f"Predictions saved to TXT: {output_txt_path}")

Predictions saved to TXT: sanjna-asritha-obama.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data_cleaned['tweet'] = sample_data_cleaned['tweet'].fillna("").astype(str).apply(preprocess_tweet)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd
import re

# Preprocessing function for tweets
def preprocess_tweet(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^A-Za-z\s]", "", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Load the dataset
file_path = 'training-Obama-Romney-tweets.xlsx'
obama_data = pd.read_excel(file_path, sheet_name='Obama')

# Data Cleaning
obama_data_cleaned = obama_data[['Anootated tweet', 'Unnamed: 4']].rename(
    columns={'Anootated tweet': 'tweet', 'Unnamed: 4': 'sentiment'}
)
valid_sentiments = ['1', '-1', '0']
obama_data_cleaned = obama_data_cleaned[obama_data_cleaned['sentiment'].isin(valid_sentiments)]
obama_data_cleaned['sentiment'] = obama_data_cleaned['sentiment'].astype(int)
obama_data_cleaned['tweet'] = obama_data_cleaned['tweet'].fillna("").astype(str).apply(preprocess_tweet)

# Add metadata features
obama_data_cleaned['tweet_length'] = obama_data_cleaned['tweet'].apply(len)
obama_data_cleaned['num_hashtags'] = obama_data_cleaned['tweet'].apply(lambda x: x.count('#'))
obama_data_cleaned['num_mentions'] = obama_data_cleaned['tweet'].apply(lambda x: x.count('@'))

# TF-IDF Vectorization with N-grams
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # Unigrams and bigrams
    stop_words='english'  # Remove stopwords
)
tfidf_features = tfidf_vectorizer.fit_transform(obama_data_cleaned['tweet'])

# Convert metadata features to sparse matrix
metadata_features = obama_data_cleaned[['tweet_length', 'num_hashtags', 'num_mentions']].values
metadata_features_sparse = csr_matrix(metadata_features)

# Combine TF-IDF and metadata features
X_combined = hstack([tfidf_features, metadata_features_sparse])

# Target variable
y = obama_data_cleaned['sentiment']

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, shuffle=True)

# Logistic Regression with GridSearchCV
param_grid_lr = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l2'],       # Regularization type
    'class_weight': ['balanced']  # Handle class imbalance
}
grid_search_lr = GridSearchCV(
    LogisticRegression(max_iter=500, random_state=42),
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=3,
    verbose=2
)
grid_search_lr.fit(X_train, y_train)

# Evaluate the Logistic Regression model
y_pred_lr = grid_search_lr.best_estimator_.predict(X_test)
print("Logistic Regression Model Performance:")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ...........C=0.1, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ...........C=0.1, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ...........C=0.1, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END .............C=1, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END .............C=1, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END .............C=1, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ............C=10, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ............C=10, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ............C=10, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ...........C=100, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ...........C=100, class_weight=balanced, penalty=l2; total time=   0.0s
[CV] END ...........C=100, class_weight=balanced,