In [8]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing function
def preprocess_tweet(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"[^A-Za-z0-9\s#@]", "", text)  # Keep hashtags and mentions
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespaces
    return text

# Load the Romney training dataset (ensure headers are correctly read)
file_path = 'training-Obama-Romney-tweets.xlsx'
romney_data = pd.read_excel(file_path, sheet_name='Romney', header=0)

# Debug: Inspect raw data
print("Raw data shape:", romney_data.shape)
print("First 5 rows of raw data:")
print(romney_data.head())

# Rename columns
romney_data_cleaned = romney_data[['Anootated tweet', 'Unnamed: 4']].rename(
    columns={'Anootated tweet': 'tweet', 'Unnamed: 4': 'sentiment'}
)

# Debug: Check sentiment column
print("Unique values in sentiment column before cleaning:")
print(romney_data_cleaned['sentiment'].unique())

# Map textual sentiment values to numeric values (if necessary)
valid_sentiments = [-1, 0, 1]  # Valid sentiment values
romney_data_cleaned['sentiment'] = pd.to_numeric(romney_data_cleaned['sentiment'], errors='coerce')
romney_data_cleaned = romney_data_cleaned[romney_data_cleaned['sentiment'].isin(valid_sentiments)]

# Debug: Check sentiment column after filtering
print("Unique values in sentiment column after filtering:")
print(romney_data_cleaned['sentiment'].unique())

# Drop rows with missing or invalid sentiment values
romney_data_cleaned = romney_data_cleaned.dropna(subset=['sentiment'])

# Debug: Check data shape after sentiment filtering
print("Data shape after filtering sentiment:", romney_data_cleaned.shape)
print("First 5 rows after filtering sentiment:")
print(romney_data_cleaned.head())

# Preprocess tweets
romney_data_cleaned['tweet'] = romney_data_cleaned['tweet'].fillna("").astype(str).apply(preprocess_tweet)

# Debug: Check tweet column after preprocessing
print("First 5 rows of tweet column after preprocessing:")
print(romney_data_cleaned[['tweet', 'sentiment']].head())

# Check for empty tweets after preprocessing
romney_data_cleaned = romney_data_cleaned[romney_data_cleaned['tweet'].str.strip() != ""]

# Debug: Check data shape after tweet preprocessing
print("Data shape after tweet preprocessing:", romney_data_cleaned.shape)

# Check if dataset is empty
if romney_data_cleaned.empty:
    print("The cleaned dataset is empty after preprocessing!")
else:
    print("The cleaned dataset is not empty. Proceeding to feature extraction.")

    # Add metadata features
    romney_data_cleaned['tweet_length'] = romney_data_cleaned['tweet'].apply(len)
    romney_data_cleaned['num_hashtags'] = romney_data_cleaned['tweet'].apply(lambda x: x.count('#'))
    romney_data_cleaned['num_mentions'] = romney_data_cleaned['tweet'].apply(lambda x: x.count('@'))

    # TF-IDF Vectorization
    tfidf_vectorizer_romney = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),  # Unigrams and bigrams
        stop_words='english'  # Remove stopwords
    )
    tfidf_features_romney = tfidf_vectorizer_romney.fit_transform(romney_data_cleaned['tweet'])

    # Combine TF-IDF and metadata features
    metadata_features = romney_data_cleaned[['tweet_length', 'num_hashtags', 'num_mentions']].values
    metadata_features_sparse = csr_matrix(metadata_features)
    X_combined = hstack([tfidf_features_romney, metadata_features_sparse])

    # Debug: Check feature matrix shape
    print("Shape of combined feature matrix:", X_combined.shape)

    # Save cleaned data for further processing
    output_cleaned_path = 'romney_cleaned_data_fixed.xlsx'
    romney_data_cleaned.to_excel(output_cleaned_path, index=False)
    print(f"Cleaned Romney dataset saved to: {output_cleaned_path}")


Raw data shape: (7201, 6)
First 5 rows of raw data:
   Unnamed: 0                 date            time  \
0         NaN                  NaN             NaN   
1         NaN  2012-10-16 00:00:00  09:38:08-05:00   
2         NaN  2012-10-16 00:00:00  10:22:34-05:00   
3         NaN  2012-10-16 00:00:00  10:14:18-05:00   
4         NaN  2012-10-16 00:00:00  09:27:16-05:00   

                                     Anootated tweet Unnamed: 4  \
0    1: positive, -1: negative, 0: neutral, 2: mixed      Class   
1  Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...         -1   
2  Senior <e>Romney</e> Advisor Claims <e>Obama</...          2   
3  .@WardBrenda @shortwave8669 @allanbourdius you...         -1   
4  <e>Mitt Romney</e> still doesn't <a>believe</a...         -1   

         Unnamed: 5  
0  Your class label  
1               NaN  
2               NaN  
3               NaN  
4               NaN  
Unique values in sentiment column before cleaning:
['Class' -1 2 1 '!!!!' 0 nan 'IR']
Un

In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Load the cleaned Romney dataset
cleaned_file_path = 'romney_cleaned_data_fixed.xlsx'
romney_data_cleaned = pd.read_excel(cleaned_file_path)

# Features and target
X_metadata = romney_data_cleaned[['tweet_length', 'num_hashtags', 'num_mentions']].values
tfidf_vectorizer_romney = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # Unigrams and bigrams
    stop_words='english'  # Remove stopwords
)
tfidf_features = tfidf_vectorizer_romney.fit_transform(romney_data_cleaned['tweet'])
X_combined = hstack([tfidf_features, csr_matrix(X_metadata)])
y = romney_data_cleaned['sentiment']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a Logistic Regression model
logistic_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
print("Training the Logistic Regression model...")
logistic_model.fit(X_train, y_train)

# Evaluate the Logistic Regression model
print("Evaluating the Logistic Regression model...")
y_pred_logistic = logistic_model.predict(X_test)

# Metrics for Logistic Regression
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
classification_rep_logistic = classification_report(y_test, y_pred_logistic, target_names=['Negative', 'Neutral', 'Positive'])

# Print results
print("Logistic Regression Model Performance:")
print(f"Accuracy: {accuracy_logistic:.2f}")
print("\nClassification Report:\n", classification_rep_logistic)


Training the Logistic Regression model...
Evaluating the Logistic Regression model...
Logistic Regression Model Performance:
Accuracy: 0.72

Classification Report:
               precision    recall  f1-score   support

    Negative       0.70      0.65      0.68       591
     Neutral       0.66      0.66      0.66       571
    Positive       0.78      0.84      0.81       574

    accuracy                           0.72      1736
   macro avg       0.71      0.72      0.72      1736
weighted avg       0.71      0.72      0.71      1736



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Load the sample test data file
sample_file_path = 'final-testData-no-label-Romney-tweets.xlsx'  # Update with the actual path to the sample test file
sample_data = pd.read_excel(sample_file_path, sheet_name='Romney')  # Ensure you're reading the correct sheet

sample_data.columns=['Index','tweet']
# Preprocess the sample dataset
sample_data_cleaned = sample_data[['tweet']]
sample_data_cleaned['tweet'] = sample_data_cleaned['tweet'].fillna("").astype(str).apply(tfidf_vectorizer_romney.build_preprocessor())

# Transform the sample dataset using the trained TF-IDF vectorizer
sample_features = tfidf_vectorizer_romney.transform(sample_data_cleaned['tweet'])

# Add metadata for the sample dataset
sample_data_cleaned['tweet_length'] = sample_data_cleaned['tweet'].apply(len)
sample_data_cleaned['num_hashtags'] = sample_data_cleaned['tweet'].apply(lambda x: x.count('#'))
sample_data_cleaned['num_mentions'] = sample_data_cleaned['tweet'].apply(lambda x: x.count('@'))
sample_metadata = sample_data_cleaned[['tweet_length', 'num_hashtags', 'num_mentions']].values
sample_metadata_sparse = csr_matrix(sample_metadata)

# Combine features
sample_combined = hstack([sample_features, sample_metadata_sparse])

# Predict sentiment for the sample dataset
print("Generating predictions on the sample dataset...")
sample_predictions = logistic_model.predict(sample_combined)

# Prepare results in the desired format
output_lines = ["(setf x '(\n"]
for index, prediction in enumerate(sample_predictions, start=1):
    output_lines.append(f"({index} {int(prediction)})\n")
output_lines.append(") )\n")

# Save predictions to a text file
output_file_path = 'sanjna-asritha-romney.txt'  # Update with desired output path
with open(output_file_path, 'w') as f:
    f.writelines(output_lines)

print(f"Predictions saved to: {output_file_path}")


Generating predictions on the sample dataset...
Predictions saved to: sanjna-asritha-romney.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data_cleaned['tweet'] = sample_data_cleaned['tweet'].fillna("").astype(str).apply(tfidf_vectorizer_romney.build_preprocessor())
