In [None]:
# NOTE: Run 'python generate_data.py' before executing this notebook; it generates the 'troop_movements.csv' file.

import pandas as pd
import numpy as np

# Read the generated troop-movement records into a DataFrame.
data = pd.read_csv('troop_movements.csv')

# Print the heading and then the first rows as a quick check of the load.
print("First few rows of the DataFrame:", data.head(), sep="\n")

In [None]:
# Showing counts of Empire vs. Resistance
df_Count = pd.DataFrame(data)

# The table printed below holds one row per distinct empire_or_resistance value
# together with how often it occurs, so the heading describes counts (the
# previous heading claimed the DataFrame was being sorted).
print("\nCounts of characters by empire_or_resistance:")
value_counts = df_Count['empire_or_resistance'].value_counts().reset_index()

print(value_counts.head())

In [None]:
# Count the records in each unit_type group (one entry per distinct unit_type).
unit_type_counts_size = data.groupby('unit_type').size()

# sep="\n" starts the Series on its own line; with the default separator the
# heading and the first line of the Series output ran together on one line.
print("Counts of characters by unit_type", unit_type_counts_size, sep="\n")


In [None]:
# Showing counts of characters by homeworld
df_Count = pd.DataFrame(data)

# value_counts(ascending=True) orders the table by count, smallest first, so the
# head() below shows the least common homeworlds. The heading now says so (the
# previous heading claimed the DataFrame was being sorted by homeworld).
print("\nCounts of characters by homeworld (least common first):")
value_counts2 = df_Count['homeworld'].value_counts(ascending=True).reset_index()

print(value_counts2.head())


In [None]:
def is_resistance(row):
    """Return True when the row's 'empire_or_resistance' value is exactly "resistance".

    `row` is anything indexable by column name, for example the Series that
    `DataFrame.apply(..., axis=1)` passes in. The comparison is case-sensitive;
    any other value yields False.
    """
    return bool(row['empire_or_resistance'] == "resistance")
# Flag resistance rows with one vectorized comparison on the column. This yields
# the same booleans as calling is_resistance on every row through
# DataFrame.apply(axis=1), without a Python-level call per row.
data['is_resistance'] = data['empire_or_resistance'] == "resistance"
print(data.head())

In [None]:
! pip install seaborn
! pip install matplotlib

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Apply seaborn's "whitegrid" theme before drawing.
sns.set_theme(style="whitegrid")

# One bar per side; hue repeats the x variable so each bar gets its own colour.
faction_ax = sns.countplot(data=data, x="empire_or_resistance", hue="empire_or_resistance")
faction_ax.set_title('Character Count by Empire or Resistance')
faction_ax.set_xlabel('Empire or Resistance')
faction_ax.set_ylabel('Count')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier


# Both model inputs are categorical, so a single list both selects the feature
# columns and names the columns to one-hot encode.
categorical_features = ['unit_type', 'homeworld']

X = data[categorical_features]  # Features: unit_type, homeworld
y = data['empire_or_resistance']  # Target variable

# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros. With the default ('error'), predict() raises as soon as the test
# split contains a unit_type or homeworld that is absent from the training split.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Route the categorical columns through the one-hot encoding pipeline. Columns
# not listed here are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

# Encode, then classify. random_state is fixed because the tree builder permutes
# features at each split, so equally good splits can otherwise be chosen
# differently from run to run; 42 matches the seed used for the train/test split.
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit the encoder and the tree on the training rows only.
decision_tree_pipeline.fit(X_train, y_train)

# Predict the held-out rows and report the fraction predicted correctly.
y_pred = decision_tree_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# One-hot encode the categorical feature columns: one indicator column per category
X_encoded = pd.get_dummies(data=X)
encoded_preview = X_encoded.head()
print("First rows of categorical features AS NUMERICS: \n", encoded_preview)


In [None]:
# Get feature importances from the fitted decision tree

# One importance value per one-hot column the classifier was trained on
importances = decision_tree_pipeline.named_steps['classifier'].feature_importances_

# Read the column names from the encoder fitted inside the pipeline so they line
# up one-to-one, and in the same order, with `importances`. A separate
# pd.get_dummies(X) encoding of the full data is not guaranteed to produce the
# same columns (for example with missing values, or with categories that are
# absent from the training split), which would mislabel or fail to build the table.
fitted_encoder = (
    decision_tree_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['encoder']
)
feature_names = fitted_encoder.get_feature_names_out()

# Create a DataFrame to hold the feature importances
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print("Printing head of feature importances \n", feature_importances.head())

In [None]:
# Sort features from most to least important
sorted_importances = feature_importances.sort_values(by='Importance', ascending=False)
print("Printing first rows of feature importances - SORTED: \n", sorted_importances.head())

# Create bar plot of the feature importances. The DataFrame is passed with the
# data= keyword, as in the countplot call earlier in the notebook; seaborn
# releases that do not accept the dataset as the first positional argument
# reject the positional form.
barplot = sns.barplot(data=sorted_importances, x="Feature", y="Importance")

# Rotate the x-axis tick labels so the long feature names stay readable
barplot.set_xticklabels(barplot.get_xticklabels(), rotation=90, horizontalalignment='right')

# Add title to plot
plt.title("Feature Importance")

# Adjust layout to prevent labels from overlapping
plt.tight_layout()

# Show the plot
plt.show()