In [None]:
# --- EDA for roommates_clean_for_tableau.csv ---
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Load dataset
df = pd.read_csv("roommates_clean_for_tableau.csv")

# Global seaborn theme for every matplotlib-based chart below.
# set_theme() is the preferred interface; sns.set() is only an alias for it
# that seaborn documents as subject to removal.
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)

# -----------------
# 1. Gender distribution
# -----------------
# Bars are ordered from the most to the least frequent category.
gender_order = df["Gender"].value_counts().index
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue; hue_order mirrors `order` so that each bar
# keeps the color it had before, and dodge=False keeps bars at full width.
ax = sns.countplot(
    data=df,
    x="Gender",
    hue="Gender",
    order=gender_order,
    hue_order=gender_order,
    dodge=False,
    palette="Set2",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Gender Distribution")
plt.show()

# -----------------
# 2. Age distribution
# -----------------
# Histogram of the Age column in 10 bins with a KDE curve overlaid.
ages = df["Age"]
plt.figure(figsize=(8, 4))
age_ax = sns.histplot(ages, color="skyblue", kde=True, bins=10)
age_ax.set_title("Age Distribution")
plt.show()

# -----------------
# 3. Top 10 Programs
# -----------------
plt.figure(figsize=(10, 5))
# The ten most frequent program_name values, largest first.
top_programs = df["program_name"].value_counts().nlargest(10)
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# categorical (y) variable is also mapped to hue; dodge=False keeps one
# full-width bar per program.
ax = sns.barplot(
    x=top_programs.values,
    y=top_programs.index,
    hue=top_programs.index,
    dodge=False,
    palette="viridis",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Top 10 Programs")
plt.xlabel("Students")
plt.ylabel("Program")
plt.show()

# -----------------
# 4. Roommate Preferences
# -----------------
# Bars are ordered from the most to the least frequent category.
roommate_order = df["roommate_pref"].value_counts().index
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue; hue_order mirrors `order` so that each bar
# keeps the color it had before, and dodge=False keeps bars at full width.
ax = sns.countplot(
    data=df,
    x="roommate_pref",
    hue="roommate_pref",
    order=roommate_order,
    hue_order=roommate_order,
    dodge=False,
    palette="pastel",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Roommate Preferences")
plt.show()

# -----------------
# 5. Food Preferences
# -----------------
# Bars are ordered from the most to the least frequent category.
food_order = df["food_pref"].value_counts().index
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue; hue_order mirrors `order` so that each bar
# keeps the color it had before, and dodge=False keeps bars at full width.
ax = sns.countplot(
    data=df,
    x="food_pref",
    hue="food_pref",
    order=food_order,
    hue_order=food_order,
    dodge=False,
    palette="pastel",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Food Preferences")
plt.show()

# -----------------
# 6. Smoker vs Non-Smoker
# -----------------
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue; dodge=False keeps bars at full width.
# NOTE(review): assumes Smoker holds string labels. With a numeric/boolean
# column, seaborn may map hue on a continuous "coolwarm" scale — confirm dtype.
ax = sns.countplot(
    data=df,
    x="Smoker",
    hue="Smoker",
    dodge=False,
    palette="coolwarm",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Smoker Distribution")
plt.show()

# -----------------
# 7. Alcohol Friendly
# -----------------
# Bars are ordered from the most to the least frequent category.
alcohol_order = df["alcohol_friendly"].value_counts().index
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue; hue_order mirrors `order` so that each bar
# keeps the color it had before, and dodge=False keeps bars at full width.
ax = sns.countplot(
    data=df,
    x="alcohol_friendly",
    hue="alcohol_friendly",
    order=alcohol_order,
    hue_order=alcohol_order,
    dodge=False,
    palette="Set1",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Alcohol Friendly Preferences")
plt.show()

# -----------------
# 8. Loud Music Allowed
# -----------------
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue; dodge=False keeps bars at full width.
ax = sns.countplot(
    data=df,
    x="loud_music_allowed",
    hue="loud_music_allowed",
    dodge=False,
    palette="muted",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Loud Music Allowed")
plt.show()

# -----------------
# 9. ASU Campus Distribution (Plotly Pie)
# -----------------
# Interactive pie with a 0.3 hole, one sector per distinct asu_campus value.
fig = px.pie(
    data_frame=df,
    names="asu_campus",
    hole=0.3,
    title="ASU Campus Distribution",
)
fig.show()

# -----------------
# 10. Top 10 States
# -----------------
plt.figure(figsize=(10, 5))
# The ten most frequent State values, largest first.
top_states = df["State"].value_counts().nlargest(10)
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# categorical (y) variable is also mapped to hue; dodge=False keeps one
# full-width bar per state.
ax = sns.barplot(
    x=top_states.values,
    y=top_states.index,
    hue=top_states.index,
    dodge=False,
    palette="mako",
)
# Older seaborn releases draw a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Top 10 States Represented")
plt.xlabel("Students")
plt.ylabel("State")
plt.show()

# -----------------
# 11. Correlation Heatmap
# -----------------
# Pairwise correlations between the numeric columns only.
corr_matrix = df.select_dtypes(include="number").corr()
if corr_matrix.empty:
    # sns.heatmap raises on an empty matrix, so skip the chart instead of
    # aborting the script when the frame has no numeric columns.
    print("No numerical features found; skipping correlation heatmap.")
else:
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap (Numerical Features)")
    plt.show()