In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import matplotlib.pyplot as plt
import pickle

In [None]:
# read generated data from csv file into a pandas dataframe
csv = "troop_movements.csv"
df = pd.read_csv(csv)

# Display the first few rows of the dataset
print(df.head())

In [None]:
# Create grouped data showing counts of empire vs. resistance
print(df.groupby('empire_or_resistance').size().reset_index(name='count'))

In [None]:
# Create grouped data showing counts of characters by homeworld
print(df.groupby('homeworld').size().reset_index(name='count'))

In [None]:
# Create grouped data showing counts of characters by unit_type
print(df.groupby('unit_type').size().reset_index(name='count'))

In [None]:
# check for null values first
df['empire_or_resistance'].isna().any()

In [None]:
# Engineer a new feature called is_resistance with a True or False value based on empire_or_resistance
# if empire, True
# if not, false
def is_resistance(col):
    if col == 'empire':
        return True
    else:
        return False
    
df['is_resistance'] = df['empire_or_resistance'].apply(is_resistance)

In [None]:
df.head()

In [None]:
# Create a bar plot using Seaborn showing Empire vs. Resistance distribution
import seaborn as sns

sns.countplot(x = 'empire_or_resistance', data = df)
plt.title('Character Count by Empire or Resistance')
plt.xlabel('Empire or Resistance')
plt.ylabel('Count')
 
# Show the plot
plt.show()

In [None]:
# Create a prediction model using sklearn.tree.DecisionTreeClassifier
# Predict if a character is joining either the Empire or the Resistance based on
# their homeworld and unit_type

# convert categorical features to numeric using pd.get_dummies
df_dummies = pd.get_dummies(df, columns=['homeworld', 'unit_type'], drop_first=True, dtype=int)

In [None]:
df_dummies

In [None]:
list(df_dummies.columns)

In [None]:
columns_to_drop = ['timestamp',
'unit_id',
'empire_or_resistance',
'location_x',
'location_y',
'destination_x',
'destination_y']
df_dummies.drop(columns=columns_to_drop, inplace=True)

In [None]:
df_dummies.columns

In [None]:
feature_cols = df_dummies.columns.drop('is_resistance')

In [None]:
X = df_dummies[feature_cols] # Features
y = df_dummies['is_resistance'] # Target variable

In [None]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 70% training and 30% test

In [None]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

In [None]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)


In [None]:
#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [None]:
# How accurate are those predictions?
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# plot the decision tree
plt.figure(figsize=(20,10))
tree.plot_tree(clf, feature_names=feature_cols, class_names=['Empire', 'Resistance'], filled=True)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
# Create a bar plot that shows feature importance

# get feature importances
importances = clf.feature_importances_

# create a df to hold feature importance
feature_importances = pd.DataFrame({'Feature': feature_cols, 'Importance': importances})

In [None]:
feature_importances

In [None]:
# Sort data in descending order
importances = feature_importances['Importance']
features = feature_importances['Feature']
sorted_categories = [x for _, x in sorted(zip(importances, features), reverse=True)]
# sorted_values = sorted(importances, reverse=True)

In [None]:
plt.figure(figsize=(10,6))
sns.barplot(x=features, y=importances, order=sorted_categories)
plt.title('Feature Importances')
plt.xlabel('Feature')
plt.ylabel('Importance')

#rotate x-axis labels
plt.xticks(rotation=90)
 
# Show the plot
plt.show()

In [None]:
# Save the model as a pickle file

# using a context manager create and open a file
# f = open('trained_model.pkl', 'wb')
# pickle.dump(clf, f)
# f.close()
with open('trained_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

print(f"Model saved to pickle")

In [None]:
# read generated data from csv file into a pandas dataframe
troop_10m_csv = "troop_movements10m.csv"
df_10m = pd.read_csv(troop_10m_csv)

# Display the first few rows of the dataset
print(df_10m.head())

In [None]:
df_10m.columns

In [None]:
df_10m.isna().sum()

In [None]:
df_10m['unit_type'].unique()

In [None]:
# some unit_type records have a value of invalid_unit (replace that with 'unknown')

def replace_invalid_unit(col):
    if col == 'invalid_unit':
        return 'unknown'
    else:
        return col
    
df_10m['unit_type'] = df_10m['unit_type'].apply(replace_invalid_unit)

In [None]:
df_10m['unit_type'].unique()

In [None]:
# some location_x and location_y values are missing (use ffill method)
df_10m['location_x'] = df_10m['location_x'].ffill()
df_10m['location_y'] = df_10m['location_y'].ffill()

In [None]:
df_10m.isna().sum()

In [None]:
! pip install pyarrow

In [None]:
! pip install -U fastparquet

In [None]:
# save the clean data into a Parquet file 
parquet_file = 'troop_movements10m.parquet'
df_10m.to_parquet(parquet_file)

In [2]:
# load the pickled model
import pickle
with open('trained_model.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [7]:
# load the data from the Parquet file into a dataframe
import pandas as pd
clean_df = pd.read_parquet('troop_movements10m.parquet')
clean_df.head(5)

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld
0,2023-06-13 17:33:18,1,at-st,2.0,8.0,1,1,Glee Anselm
1,2023-06-13 17:33:17,2,tie_silencer,4.0,4.0,0,1,Trandosha
2,2023-06-13 17:33:16,3,at-at,0.0,3.0,6,1,Corellia
3,2023-06-13 17:33:15,4,tie_silencer,6.0,1.0,6,9,Shili
4,2023-06-13 17:33:14,5,tie_fighter,0.0,4.0,9,6,Muunilinst


In [10]:
new_df = clean_df[['homeworld', 'unit_type']]

In [11]:
# convert categorical features to numeric using pd.get_dummies
new_df_dummies = pd.get_dummies(new_df, columns=['homeworld', 'unit_type'], drop_first=True, dtype=int)

In [12]:
new_df_dummies.columns

Index(['homeworld_Aleen Minor', 'homeworld_Bestine IV', 'homeworld_Cerea',
       'homeworld_Champala', 'homeworld_Chandrila', 'homeworld_Concord Dawn',
       'homeworld_Corellia', 'homeworld_Dagobah', 'homeworld_Dathomir',
       'homeworld_Dorin', 'homeworld_Eriadu', 'homeworld_Glee Anselm',
       'homeworld_Haruun Kal', 'homeworld_Iktotch', 'homeworld_Iridonia',
       'homeworld_Kalee', 'homeworld_Kashyyyk', 'homeworld_Malastare',
       'homeworld_Mirial', 'homeworld_Mon Cala', 'homeworld_Muunilinst',
       'homeworld_Naboo', 'homeworld_Ojom', 'homeworld_Quermia',
       'homeworld_Rodia', 'homeworld_Ryloth', 'homeworld_Serenno',
       'homeworld_Shili', 'homeworld_Skako', 'homeworld_Socorro',
       'homeworld_Stewjon', 'homeworld_Sullust', 'homeworld_Tatooine',
       'homeworld_Tholoth', 'homeworld_Toydaria', 'homeworld_Trandosha',
       'homeworld_Troiken', 'homeworld_Tund', 'homeworld_Umbara',
       'homeworld_Vulpter', 'homeworld_Zolan', 'unit_type_at-st',
       'unit

In [13]:

X = new_df_dummies # Features

In [14]:
# run the data through the model
predictions = loaded_clf.predict(X)

In [15]:
# add the predicted values to the data frame
clean_df['predictions'] = predictions

In [16]:
clean_df.head()

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld,predictions
0,2023-06-13 17:33:18,1,at-st,2.0,8.0,1,1,Glee Anselm,False
1,2023-06-13 17:33:17,2,tie_silencer,4.0,4.0,0,1,Trandosha,False
2,2023-06-13 17:33:16,3,at-at,0.0,3.0,6,1,Corellia,True
3,2023-06-13 17:33:15,4,tie_silencer,6.0,1.0,6,9,Shili,True
4,2023-06-13 17:33:14,5,tie_fighter,0.0,4.0,9,6,Muunilinst,True
