In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Read the CSV data
df = pd.read_csv('conference_responses.csv')

# Clean the data
df = df.dropna(subset=['Response'])  # Remove rows with NaN in Response column
responses = df['Response'].values

# One-hot encode the input features
feature_columns = ['Year of Study', 'Current Role in AIESEC', 'Department']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X = encoder.fit_transform(df[feature_columns])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, responses, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression(multi_class='ovr', max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Function to predict responses for new data
def predict_responses(new_data):
    encoded_data = encoder.transform(new_data[feature_columns])
    return model.predict(encoded_data)

# Make predictions for all data
all_predictions = predict_responses(df)

# Calculate overall recommendation
yes_maybe_count = sum(r in ["Yes", "Maybe"] for r in all_predictions)
recommendation = "Host the conference" if yes_maybe_count >= len(all_predictions) * 0.5 else "Don't host the conference"

print(f"\nOverall recommendation based on predictions: {recommendation}")

# Additional insights
response_counts = pd.Series(all_predictions).value_counts()
print("\nPredicted Response distribution:")
print(response_counts)
print(f"\nTotal responses: {len(all_predictions)}")
print(f"Percentage of 'Yes' responses: {response_counts.get('Yes', 0) / len(all_predictions) * 100:.2f}%")
print(f"Percentage of 'Maybe' responses: {response_counts.get('Maybe', 0) / len(all_predictions) * 100:.2f}%")
print(f"Percentage of 'No' responses: {response_counts.get('No', 0) / len(all_predictions) * 100:.2f}%")

# Additional analysis
print("\nAdditional Analysis:")
print(f"Total unique respondents: {df['Email Address'].nunique()}")
print("\nActual Response by Year of Study:")
print(df.groupby('Year of Study')['Response'].value_counts(normalize=True).unstack().fillna(0) * 100)
print("\nActual Response by Department:")
print(df.groupby('Department')['Response'].value_counts(normalize=True).unstack().fillna(0) * 100)

KeyError: "['Department'] not in index"