# CS 6830 Project 7
Seth Beckett & Dave Storey

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

import seaborn as sns
import matplotlib.pyplot as plt

## Mushroom Dataset
This first part is the dataset that Seth worked with. It concerns classifying whether or not mushrooms are poisonous based on many different categorical attributes. More details about the specific categories and how to decode them can be found [here](https://www.kaggle.com/datasets/uciml/mushroom-classification).

In [None]:
# read the raw mushroom table from disk
mushroom_csv = "data/mushrooms.csv"
shrooms = pd.read_csv(mushroom_csv)

# list the available columns, then preview the first rows as the cell output
display(shrooms.columns)
shrooms.head()

### Full Models
First we will try creating logistic regression models and a simple linear SVC with all of the data, and test how both perform. The GridSearchCV can take a while to fit, so I am just using the optimal parameters I found here.

In [None]:
# one-hot encode every predictor; "class" is the target, so it is left out
features = shrooms.drop(columns="class")
X = pd.get_dummies(features)

# numeric target: "p" -> 1, "e" -> 0
y = shrooms["class"].map({"p": 1, "e": 0})

# hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

print("Shape of data: ", X.shape)

In [None]:
# logistic regression on the full one-hot feature set, scored on the held-out rows
full_log_model = LogisticRegression(
    solver='liblinear', C=1, max_iter=1000
).fit(X_train, y_train)

y_pred = full_log_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# linear-kernel SVC on the full one-hot feature set, scored on the held-out rows
full_linear_svc = SVC(C=1, kernel='linear').fit(X_train, y_train)

y_pred = full_linear_svc.predict(X_test)
print(classification_report(y_test, y_pred))

### Top 5 Features
As we can see, both of these models end up with perfect accuracy, precision, and recall. We know that we have a lot of data, so in order to make our model simpler we will try using the top 5 features and see how that affects our metrics.

In [None]:
# rank the features by how strongly they affect the logistic prediction:
# pair each column with its fitted coefficient, then sort by absolute value
log_coef = pd.DataFrame({"feature": X.columns, "coef": full_log_model.coef_[0]})
log_coef = (
    log_coef
    .assign(abs_coef=log_coef['coef'].abs())
    .sort_values('abs_coef', ascending=False)
)

In [None]:
# refit logistic regression using only its 5 highest-ranked features
top5_log_features = log_coef['feature'].head(5).tolist()
top5_log_model = LogisticRegression(
    solver='liblinear', C=1, max_iter=1000
).fit(X_train[top5_log_features], y_train)

y_pred = top5_log_model.predict(X_test[top5_log_features])
print(classification_report(y_test, y_pred))

# confusion matrix on the held-out rows (label 0 = edible, 1 = poisonous)
top5_log_cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(top5_log_cm, display_labels=["edible", "poisonous"])
disp.plot()
plt.title("Confusion Matrix for Top 5 Logistic Regression Features")
plt.show()

In [None]:
# rank the features by how strongly they affect the SVC prediction:
# pair each column with its fitted coefficient, then sort by absolute value
svc_coef = pd.DataFrame({"feature": X.columns, "coef": full_linear_svc.coef_[0]})
svc_coef = (
    svc_coef
    .assign(abs_coef=svc_coef['coef'].abs())
    .sort_values('abs_coef', ascending=False)
)


In [None]:
# refit the linear SVC using only its 5 highest-ranked features
top5_svc_features = svc_coef['feature'].head(5).tolist()
top5_svc = SVC(C=1, kernel='linear').fit(X_train[top5_svc_features], y_train)

y_pred = top5_svc.predict(X_test[top5_svc_features])
print(classification_report(y_test, y_pred))

# confusion matrix on the held-out rows (label 0 = edible, 1 = poisonous)
top5_svc_cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(top5_svc_cm, display_labels=["edible", "poisonous"])
disp.plot()
plt.title("Confusion Matrix for Top 5 SVC Features")
plt.show()

### Top 3 Features
That is insanely good for only considering 5 features. Let's see how accuracy is affected when we drop it down to the top 3 features for each.

# refit logistic regression using only its 3 highest-ranked features
top3_log_features = log_coef['feature'].head(3).tolist()
top3_log_model = LogisticRegression(
    solver='liblinear', C=1, max_iter=1000
).fit(X_train[top3_log_features], y_train)

y_pred = top3_log_model.predict(X_test[top3_log_features])
print(classification_report(y_test, y_pred))

# confusion matrix on the held-out rows (label 0 = edible, 1 = poisonous)
top3_log_cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(top3_log_cm, display_labels=["edible", "poisonous"])
disp.plot()
plt.title("Confusion Matrix for Top 3 Logistic Regression Features")
plt.show()

In [None]:
# refit the linear SVC using only its 3 highest-ranked features
top3_svc_features = svc_coef['feature'].head(3).tolist()
top3_svc = SVC(C=1, kernel='linear').fit(X_train[top3_svc_features], y_train)

y_pred = top3_svc.predict(X_test[top3_svc_features])
print(classification_report(y_test, y_pred))

# confusion matrix on the held-out rows (label 0 = edible, 1 = poisonous)
top3_svc_cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(top3_svc_cm, display_labels=["edible", "poisonous"])
disp.plot()
plt.title("Confusion Matrix for Top 3 SVC Features")
plt.show()

In [None]:
# show the 5 highest-ranked features (with coefficients) for each model,
# logistic regression first and then the SVC
top5_tables = (
    ("Top 5 Logistic Regression Features", log_coef),
    ("Top 5 SVC Features", svc_coef),
)
for heading, ranking in top5_tables:
    print(heading)
    display(ranking.head(5))

### Conclusion
As we can see, the logistic regression model using the features "odor_n", "spore-print-color_r", and "odor_l" performs amazingly well. Both SVC and logistic regression only have 8 misclassifications when using their top 5 features, but SVC can't be trusted to save you from poisonous mushrooms if you only use the top 3 features.

Decoding the odor variables goes as follows: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s. For spore print color, it goes as follows: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y.

One interesting future direction one could take is to try looking at feature importance when not considering odor, since odor might be tricky to identify for some people.

# Airline Dataset

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression

# Min-max normalizes every column of a dataframe
def normalize(df):
    """Return a copy of ``df`` with each column min-max scaled to [0, 1].

    Every column is rescaled as ``(value - column_min) / (column_max -
    column_min)``. All columns must be numeric or boolean; boolean columns
    (such as the dummies ``pd.get_dummies`` can produce) are cast to float
    first, because ``-`` is not defined between boolean arrays.

    A column whose values are all identical has a zero range; it is mapped
    to 0.0 instead of being divided by zero and turned into NaN.

    Parameters:
        df: DataFrame to normalize. It is not modified.

    Returns:
        A new float DataFrame with the same index and columns as ``df``.
    """
    numeric = df.astype(float)
    col_min = numeric.min()
    col_range = numeric.max() - col_min
    # a zero range would give 0/0; dividing by 1 instead leaves the column at 0.0
    col_range = col_range.where(col_range != 0, 1.0)
    return (numeric - col_min) / col_range

### Data Cleaning and Normalization

In [None]:
# load the airline survey, discard rows with missing values, and remove the
# 'Unnamed: 0' and 'id' columns
df = (
    pd.read_csv('daveData/train.csv')
    .dropna()
    .drop(columns=['Unnamed: 0', 'id'])
)

# one-hot encode the categorical columns, then rescale everything with normalize()
hotdf = normalize(pd.get_dummies(df))
hotdf.head()

### Heatmap Generation

In [None]:
# pairwise correlation between every column of the encoded, normalized data
corr_matrix = hotdf.corr()
best_features = corr_matrix.index

# best_features covers every column in corr_matrix, so the matrix is plotted
# directly instead of recomputing hotdf[best_features].corr()
plt.figure(figsize=(25, 25))
hot = sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn")

### RBF SVM

In [None]:
# target is the 'satisfaction_neutral or dissatisfied' column; both satisfaction
# columns are removed from the predictors so the label cannot leak into them
y = hotdf['satisfaction_neutral or dissatisfied']
x = hotdf.drop(columns=['satisfaction_neutral or dissatisfied', 'satisfaction_satisfied'])

# The dataset is so big that I need to cut it down: keep only the 30% "train"
# part of this first split. The seed makes the subsample repeatable, matching
# the seeded split used for the mushroom data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.7, random_state=7)

# split the retained 30% again into the real train (80%) and test (20%) sets
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=7)

clf = svm.SVC(kernel='rbf', gamma=0.1)
clf.fit(x_train, y_train)
print("Done Fitting the Model")
y_pred = clf.predict(x_test)

# per-class precision, recall and F1 on the held-out rows (support is unused)
p,r,f,s = precision_recall_fscore_support(y_test, y_pred)

print(f'Per: {p}')
print(f'Rec: {r}')
print(f'F1: {f}')

### Feature Importance
Note: this can take a while to run, and the results depend on the random splits above. However, Food and Drink is almost always in the top 10, and it is the first factor that airlines can actually change proactively. Inflight service also usually ranks high.

In [None]:
# permutation importance of each feature for the current clf on the held-out
# rows; the seed makes the 5 repeats per feature repeatable
perm_importance = permutation_importance(
    clf, x_test, y_test, n_jobs=-1, n_repeats=5, random_state=7
)
print("Done with Importance Stuff")

# argsort is ascending, so reverse it to list the most important feature first
sorted_idx = perm_importance.importances_mean.argsort()[::-1]

# print each feature with its mean importance so the ranking is readable
for i in sorted_idx:
    print(f"{x_test.columns[i]}: {perm_importance.importances_mean[i]:.4f}")

### Polynomial SVM

In [None]:
# degree-2 polynomial-kernel SVM on the airline training subsample
clf = svm.SVC(degree=2, kernel='poly').fit(x_train, y_train)
print("Done Fitting the Model")
y_pred = clf.predict(x_test)

# per-class precision, recall and F1 on the held-out rows (support is unused)
p, r, f, s = precision_recall_fscore_support(y_test, y_pred)

for metric_name, per_class in (("Per", p), ("Rec", r), ("F1", f)):
    print(f'{metric_name}: {per_class}')

### Linear SVM
Increasing the class weight appears to increase the precision of our model. The recall suffers.

In [None]:
# linear-kernel SVM with class 0 weighted 4x
clf = svm.SVC(class_weight={0: 4}, kernel='linear').fit(x_train, y_train)
print("Done Fitting the Model")
y_pred = clf.predict(x_test)

# per-class precision, recall and F1 on the held-out rows (support is unused)
p, r, f, s = precision_recall_fscore_support(y_test, y_pred)

for metric_name, per_class in (("Per", p), ("Rec", r), ("F1", f)):
    print(f'{metric_name}: {per_class}')

### Logistic Regression

In [None]:
# logistic regression baseline on the same airline split as the SVMs
lm = LogisticRegression()
lm.fit(x_train, y_train)

# accuracy on the training rows alone is optimistic, so also report accuracy on
# the held-out rows, which is what the SVM cells are evaluated on
print(f"Train accuracy: {lm.score(x_train, y_train)}")
print(f"Test accuracy: {lm.score(x_test, y_test)}")

### Data Visualization

In [None]:
# scatter two of the rating columns from the training rows, coloured by label:
# red where the target equals 0, blue otherwise
plotx = x_train[['Food and drink', 'Inflight service']]
ploty = y_train
color = ['b' if label != 0 else 'r' for label in ploty]

plt.scatter(plotx['Food and drink'], plotx['Inflight service'], c=color)
plt.xlabel("Food and Drink Rating")
plt.ylabel("Inflight Service Rating")

# write the figure to disk
plt.savefig('FoodDrink.pdf')