# $\mu$ girls - Modeling

- Rani Misra, Cheryl Chiu, Abigail Davis, Kashfia Sharmin

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv("Personal Annotations.csv", usecols=["Comment", "Gold Standard"])
df['Comment'] = df['Comment'].fillna('').astype(str)

# Feature engineering based on text
df['num_caps'] = df['Comment'].apply(lambda x: sum(1 for word in x.split() if word.isupper()))
df['num_exclamations'] = df['Comment'].apply(lambda x: x.count('!'))
df['has_repeated_chars'] = df['Comment'].apply(lambda x: 1 if re.search(r'(.)\1{2,}', x) else 0)

# Print the first few rows to confirm
print(df.head())

# Vectorize the text using Bag of Words (CountVectorizer)
vectorizer = CountVectorizer(stop_words='english', max_features=1000)  # You can adjust the max_features as needed
X_text = vectorizer.fit_transform(df['Comment'])

# Combine the features: numerical features and the Bag of Words features
X_numeric = df[['num_caps', 'num_exclamations', 'has_repeated_chars']]
X_combined = np.hstack((X_numeric, X_text.toarray()))

# Target variable
y = df['Gold Standard']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Print the classification report to evaluate the model
print(classification_report(y_test, y_pred))

                                             Comment  Gold Standard  num_caps  \
0  For information regarding public services to b...     Technology         0   
1  Improve and fund public transportation to allo...  Public Safety         0   
2  More access to info like mutual aid and having...  Public Safety         0   
3  Greater access to transportation and more sola...     Technology         0   
4  Create a 'NYC time square' like location downt...     Technology         1   

   num_exclamations  has_repeated_chars  
0                 0                   0  
1                 0                   0  
2                 0                   0  
3                 0                   0  
4                 0                   0  
               precision    recall  f1-score   support

         Both       0.71      0.29      0.42        17
      Neither       0.75      0.87      0.81        79
Public Safety       0.50      0.33      0.40         6
   Technology       0.82      0.82      

In [3]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, precision_recall_fscore_support

pipeline = make_pipeline(vectorizer, model)

def custom_scorer(y_true, y_pred):
    metrics = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    return {'precision': metrics[0], 'recall': metrics[1], 'f1': metrics[2]}

scoring = {
    'precision': make_scorer(lambda y_true, y_pred: custom_scorer(y_true, y_pred)['precision']),
    'recall': make_scorer(lambda y_true, y_pred: custom_scorer(y_true, y_pred)['recall']),
    'f1': make_scorer(lambda y_true, y_pred: custom_scorer(y_true, y_pred)['f1']),
}

cv_results = cross_validate(pipeline, df['Comment'], df['Gold Standard'], cv=5, scoring=scoring)

print("Precision (mean):", np.mean(cv_results['test_precision']))
print("Recall (mean):", np.mean(cv_results['test_recall']))
print("F1-Score (mean):", np.mean(cv_results['test_f1']))

Precision (mean): 0.7109864121591349
Recall (mean): 0.7140000000000001
F1-Score (mean): 0.6939051401499264


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Define the pipeline to include vectorization and logistic regression
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english', max_features=1000)),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Define the hyperparameter grid
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'logreg__penalty': ['l2'],           
    'logreg__solver': ['lbfgs', 'saga']   # Solvers that support multi-class classification
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1_weighted',  # Focus on F1 score for imbalanced classes
    n_jobs=-1,  # Use all available cores
    verbose=2
)

# Perform the grid search
grid_search.fit(df['Comment'], df['Gold Standard'])

# Display the best parameters and the corresponding F1 score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(df['Comment'])

# Evaluate using the classification report
print(classification_report(df['Gold Standard'], y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best Parameters: {'logreg__C': 1, 'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs'}
Best F1 Score: 0.6939051401499264
               precision    recall  f1-score   support

         Both       1.00      0.88      0.93        89
      Neither       0.92      0.98      0.95       347
Public Safety       1.00      0.84      0.91        61
   Technology       0.98      0.97      0.97       503

     accuracy                           0.96      1000
    macro avg       0.97      0.92      0.94      1000
 weighted avg       0.96      0.96      0.96      1000





In [5]:
# Create a DataFrame with the comments, gold standard, and predictions
output_df = pd.DataFrame({
    'Comment': df['Comment'],  # Use the original comments from the DataFrame
    'Gold Standard': df['Gold Standard'],  # Gold standard values (true labels)
    'Prediction': y_pred  # Predictions from the best model
})

# Write the DataFrame to a CSV file
output_df.to_csv('Best_Model_Predictons.csv', index=False)

print("CSV file 'Best_Model_Predictons.csv' has been created.")

CSV file 'Best_Model_Predictons.csv' has been created.
