Classification

In [203]:
#import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [205]:
#reading in the data
#training
tdata1 = pd.read_csv("/Users/stolieerickson/Documents/MSBA/Fall/GSB544-Computing/Final/political_train.csv")
#validation
vdata1 = pd.read_csv("/Users/stolieerickson/Documents/MSBA/Fall/GSB544-Computing/Final/political_test.csv")
#predictors and response
X = tdata1.drop(columns=['id_num', 'political_affiliation'], axis=1)
y = tdata1['political_affiliation']

In [324]:
tdata1.isnull().sum()

id_num                   0
Q1                       0
Q2                       0
political_affiliation    0
Q4                       0
Q5                       0
Q6                       0
Q7                       0
Q8                       0
Q9                       0
Q10                      0
Q11                      0
Q12                      0
Q13                      0
Q14                      0
Q15                      0
Q16                      0
Q17                      0
Q18                      0
dtype: int64

In [323]:
#preprocessing
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')), 
        ('scaler', StandardScaler())
    ]), make_column_selector(dtype_include=np.number)),
    
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), make_column_selector(dtype_include=object))
])

#pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC())
])

#define tuning grid
param_grid = {
    'classifier__C': [0.1, 1, 10],          
    'classifier__kernel': ['linear', 'rbf'], 
    'classifier__gamma': ['scale', 'auto']  
}

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#gridsearch
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,  
    n_jobs=-1,  
    verbose=2  
)

#train with the best parameters
grid_search.fit(X_train, y_train)

#print
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)

#make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

#evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Best Parameters: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.6510989010989012
Test Set Accuracy: 0.5588235294117647

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.67      0.67      0.67        12
 Independent       0.30      0.33      0.32         9
  Republican       0.67      0.62      0.64        13

    accuracy                           0.56        34
   macro avg       0.54      0.54      0.54        34
weighted avg       0.57      0.56      0.56        34

Best Parameters: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.6510989010989012
Test Set Accuracy: 0.5588235294117647

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.67      0.67      0.67        12
 Independent       0.30      0.33      0.32         9
  Republican       0.67   

Model specifications with the most similar cv and test accuracy:

model1: 
- classifier_C = 10
- classifier_gamma = auto
- classifier_kernel = rbf

model2:
- classifier_C = 0.1
- classifier_gamma = scale
- classifier_kernel = linear

model3:
- classifier_C = 1
- classifier_gamma = scale
- classifier_kernel = linear

In [331]:
#model1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

#pipeline
pipeline_model1 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(C=10, gamma='auto', kernel='rbf'))
])

#fit the model
pipeline_model1.fit(X_train, y_train)

#predict and evaluate
y_pred = pipeline_model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

#print
print(f"Model 1 - SVM (C=10, gamma='auto', kernel='rbf')")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model 1 - SVM (C=10, gamma='auto', kernel='rbf')
Test Accuracy: 0.6471

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.64      0.78      0.70         9
 Independent       0.60      0.46      0.52        13
  Republican       0.69      0.75      0.72        12

    accuracy                           0.65        34
   macro avg       0.64      0.66      0.65        34
weighted avg       0.64      0.65      0.64        34

Model 1 - SVM (C=10, gamma='auto', kernel='rbf')
Test Accuracy: 0.6471

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.64      0.78      0.70         9
 Independent       0.60      0.46      0.52        13
  Republican       0.69      0.75      0.72        12

    accuracy                           0.65        34
   macro avg       0.64      0.66      0.65        34
weighted avg       0.64      0.65      0.64        34



In [318]:
#model2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

#pipeline
pipeline_model2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(C=0.1, gamma='scale', kernel='linear'))
])

#fit the model
pipeline_model2.fit(X_train, y_train)

#predict and evaluate
y_pred = pipeline_model2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

#print
print(f"Model 2 - SVM (C=0.1, gamma='scale', kernel='linear')")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model 2 - SVM (C=0.1, gamma='scale', kernel='linear')
Test Accuracy: 0.7059

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.53      0.80      0.64        10
 Independent       0.78      0.58      0.67        12
  Republican       0.90      0.75      0.82        12

    accuracy                           0.71        34
   macro avg       0.74      0.71      0.71        34
weighted avg       0.75      0.71      0.71        34

Model 2 - SVM (C=0.1, gamma='scale', kernel='linear')
Test Accuracy: 0.7059

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.53      0.80      0.64        10
 Independent       0.78      0.58      0.67        12
  Republican       0.90      0.75      0.82        12

    accuracy                           0.71        34
   macro avg       0.74      0.71      0.71        34
weighted avg       0.75      0.71      0.71        34



In [362]:
#model3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

#pipeline
pipeline_model3 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(C=1, gamma='scale', kernel='linear'))
])

#fit the model
pipeline_model3.fit(X_train, y_train)

#predict and evaluate
y_pred = pipeline_model3.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

#print
print(f"Model 3 - SVM (C=1, gamma='scale', kernel='linear')")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model 3 - SVM (C=1, gamma='scale', kernel='linear')
Test Accuracy: 0.6765

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.67      0.40      0.50        10
 Independent       0.54      0.64      0.58        11
  Republican       0.80      0.92      0.86        13

    accuracy                           0.68        34
   macro avg       0.67      0.65      0.65        34
weighted avg       0.68      0.68      0.66        34

Model 3 - SVM (C=1, gamma='scale', kernel='linear')
Test Accuracy: 0.6765

Classification Report:
               precision    recall  f1-score   support

    Democrat       0.67      0.40      0.50        10
 Independent       0.54      0.64      0.58        11
  Republican       0.80      0.92      0.86        13

    accuracy                           0.68        34
   macro avg       0.67      0.65      0.65        34
weighted avg       0.68      0.68      0.66        34



## Attempt 1 (Model2)

In [367]:
#drop id
X_test = vdata1.drop(columns=['id_num'], axis=1)

#pipeline
pipeline_model2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(C=0.1, gamma='scale', kernel='linear'))
])

#fit the model on the entire dataset
pipeline_model2.fit(X, y)

#predict
political_affiliation_predictions = pipeline_model2.predict(X_test)

#save to dataframe
final_predictions = pd.DataFrame({
    "id_num": vdata1['id_num'],
    "political_affiliation_predicted": political_affiliation_predictions
})

#write to a csv
final_predictions.to_csv("/Users/stolieerickson/Documents/MSBA/Fall/GSB544-Computing/Final/final_party_predictions.csv", index=False)


political_affiliation_predicted
Independent    37.951807
Democrat       37.349398
Republican     24.698795
Name: proportion, dtype: float64

## Attempt 2 

In [369]:

#drop id column
X_test = vdata1.drop(columns=['id_num'], axis=1)

#pipeline
pipeline_model1 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(C=10, gamma='auto', kernel='rbf'))
])

# fit the model on the entire dataset
pipeline_model1.fit(X, y)

#predict
political_affiliation_predictions = pipeline_model1.predict(X_test)

#create dataframe
final_predictions = pd.DataFrame({
    "id_num": vdata1['id_num'],
    "political_affiliation_predicted": political_affiliation_predictions
})

#write to a csv file
final_predictions.to_csv("/Users/stolieerickson/Documents/MSBA/Fall/GSB544-Computing/Final/final_party_predictions2.csv", index=False)

political_affiliation_predicted
Independent    39.759036
Democrat       36.746988
Republican     23.493976
Name: proportion, dtype: float64

## Attempt 3

In [371]:
#drop id
X_test = vdata1.drop(columns=['id_num'], axis=1)

#pipeline
pipeline_model3 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(C=1, gamma='scale', kernel='linear'))
])

#fit on entire dataset
pipeline_model3.fit(X, y)

#predict
political_affiliation_predictions = pipeline_model3.predict(X_test)

#save to dataframe
final_predictions = pd.DataFrame({
    "id_num": vdata1['id_num'],
    "political_affiliation_predicted": political_affiliation_predictions
})


   id_num  ... political_affiliation_predicted_2
0       2  ...                        Republican
1       3  ...                          Democrat
2       4  ...                       Independent
3       6  ...                        Republican
4      11  ...                       Independent

[5 rows x 3 columns]
Number of matching predictions: 152 out of 166
Percentage of matching predictions: 91.57%
   id_num  ... political_affiliation_predicted_2
0       2  ...                        Republican
1       3  ...                          Democrat
2       4  ...                       Independent
3       6  ...                        Republican
4      11  ...                       Independent

[5 rows x 3 columns]
Number of matching predictions: 152 out of 166
Percentage of matching predictions: 91.57%


# Regression

In [154]:
#reading in the data
#training
tdata2 = pd.read_csv("/Users/stolieerickson/Documents/MSBA/Fall/GSB544-Computing/Final/house_train.csv")
#validation
vdata2 = pd.read_csv("/Users/stolieerickson/Documents/MSBA/Fall/GSB544-Computing/Final/house_test.csv")

## Attempt 1

In [200]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # Imputer to handle missing values
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)


#predictor and response
X = tdata2.drop(columns=['SalePrice', 'PID']) 
y = np.log(tdata2['SalePrice'])

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#preprocessing
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')), 
        ('scaler', StandardScaler())
    ]), make_column_selector(dtype_include=np.number)),
    
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), make_column_selector(dtype_include=object))
])

#pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet())
])

#grid search tuning
param_grid = {
    'model__alpha': np.logspace(-3, 1, 10),  
    'model__l1_ratio': np.linspace(0, 1, 10) 
}

#grid search cross validation
grid_search = GridSearchCV(
    pipeline, 
    param_grid=param_grid, 
    cv=10, 
    scoring='neg_root_mean_squared_error', 
    n_jobs=-1
)

#fit the model
grid_search.fit(X_train, y_train)

#evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

#print results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test RMSE: {rmse:.4f}")


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Best Parameters: {'model__alpha': 0.001, 'model__l1_ratio': 0.0}
Test RMSE: 0.1431
Best Parameters: {'model__alpha': 0.001, 'model__l1_ratio': 0.0}
Test RMSE: 0.1431


In [173]:
#using the paramaters found above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
pipeline_elastic = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet(alpha=0.001, l1_ratio=0.1111))
])

#fit the model
pipeline_elastic.fit(X_train, y_train)

#predict and evaluate
y_pred = pipeline_elastic.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

#print results
print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 0.1328
Test RMSE: 0.1328


In [177]:
#fit final model
final_model = pipeline_elastic.fit(X, y)

#predict on vdata2
X_test = vdata2.drop(columns=['PID'])
pids = vdata2['PID']

#predictions
y_pred_log = final_model.predict(X_test) 
#exponentiate
y_pred = np.exp(y_pred_log) 

#create dataframe
results_df = pd.DataFrame({
    'PID': pids, 
    'SalePrice': y_pred
})

#write to a csv
results_df.to_csv('predicted_sale_prices.csv', index=False)

len(results_df)

605

## Attempt 2

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
pipeline_elastic2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet(alpha=0.0027825594022071257, l1_ratio=0.0))
])

#fit the model
pipeline_elastic2.fit(X_train, y_train)

#predict and evaluate
y_pred = pipeline_elastic2.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

#print results
print(f"Test RMSE: {rmse:.4f}")

  model = cd_fast.sparse_enet_coordinate_descent(


Test RMSE: 0.1204
Test RMSE: 0.1204


In [199]:
#fit the model
final_model2 = pipeline_elastic2.fit(X, y)

#predict on vdata2
X_test = vdata2.drop(columns=['PID']) 
pids = vdata2['PID'] 

#predict
y_pred_log = final_model2.predict(X_test)
#exponentiate
y_pred = np.exp(y_pred_log)

#dataframe
results_df2 = pd.DataFrame({
    'PID': pids, 
    'SalePrice': y_pred 
})

#write to a csv
results_df2.to_csv('predicted_sale_prices2.csv', index=False)

len(results_df2)

  model = cd_fast.sparse_enet_coordinate_descent(


605