In [None]:
import pandas as pd

# Read troop_movements.csv into a pandas DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='troop_movements.csv')

print(df.head(n=5))

In [None]:
# Each summary below follows the same recipe: value_counts() yields a Series of
# counts indexed by the distinct values, reset_index() turns that Series into a
# two-column DataFrame, and the columns are then given explicit labels.

# Count rows per faction (empire vs resistance)
empire_vs_resistance = df['empire_or_resistance'].value_counts().reset_index()
empire_vs_resistance.columns = ['empire_or_resistance','count']
print(empire_vs_resistance)
print("\n")

# Count rows per homeworld
characters_by_homeworld = df['homeworld'].value_counts().reset_index()
characters_by_homeworld.columns = ['homeworld','count']
print(characters_by_homeworld)
print("\n")

# Count rows per unit_type.
# reset_index() is required here: without it the result is still a Series and
# assigning .columns does not relabel anything.
characters_by_unit_type = df['unit_type'].value_counts().reset_index()
characters_by_unit_type.columns = ['unit_type','count']
print(characters_by_unit_type)
print("\n")


# Engineer a boolean feature is_resistance: True where empire_or_resistance
# equals 'resistance', False otherwise
df['is_resistance'] = df['empire_or_resistance'] == 'resistance'
print(df['is_resistance'])


In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reuse the DataFrame `df` loaded from troop_movements.csv earlier in the notebook
# instead of re-reading the file.

# Drop the timestamp column as it's not needed for the model and is causing errors.
# errors='ignore' keeps this cell re-runnable: a second run, when the column is
# already gone, is a no-op instead of a KeyError.
df = df.drop(columns=['timestamp'], errors='ignore')

# Create the boolean feature 'is_resistance' (True where the faction is 'resistance')
df['is_resistance'] = df['empire_or_resistance'] == 'resistance'

# Bar plot of the number of rows per faction (Empire vs Resistance)
sns.countplot(x='empire_or_resistance', data=df)
plt.title('Empire vs Resistance Distribution')
plt.xlabel('Faction')
plt.ylabel('Count')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# One-hot encode the homeworld and unit_type columns
df_encoded = pd.get_dummies(data=df, columns=['homeworld', 'unit_type'])

# The target is the is_resistance flag; the features are every remaining column
# except the two label columns
target = df_encoded['is_resistance']
features = df_encoded.drop(columns=['empire_or_resistance', 'is_resistance'])

# Hold out 20% of the rows for testing, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Build the decision tree and fit it on the training split (fit returns the estimator)
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out split and report it
accuracy = model.score(X_test, y_test)
print(f'Model Accuracy: {accuracy}')

In [None]:
# Importance assigned by the fitted model to each training column
importances = model.feature_importances_

# Pair every feature name with its importance and order the rows from most to
# least important
feature_importances = pd.DataFrame(
    {'Feature': features.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Bar plot with the feature names on the y-axis and their importance on the x-axis
sns.barplot(data=feature_importances, x='Importance', y='Feature')
plt.title('feature importance')

plt.show()


In [None]:
import pickle

# Serialize the trained model to trained_model.pkl in binary mode
with open('trained_model.pkl', mode='wb') as model_file:
    pickle.Pickler(model_file).dump(model)


In [None]:
import pandas as pd
import pickle

# Step 1: Load the real data
df_real = pd.read_csv('troop_movements10m.csv')

# Step 2: Data cleaning
# Replace the 'invalid_unit' placeholder with 'unknown'
df_real['unit_type'] = df_real['unit_type'].replace('invalid_unit', 'unknown')

# Forward-fill missing location_x and location_y values.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on a
# column selection is chained assignment, which is deprecated and does not update
# df_real when pandas Copy-on-Write is active.
# NOTE: forward fill cannot fill rows that precede the first valid value, so any
# leading NaN in these columns remains.
location_columns = ['location_x', 'location_y']
df_real[location_columns] = df_real[location_columns].ffill()

# Save the cleaned data to a Parquet file
df_real.to_parquet('troop_movements10m.parquet', engine='pyarrow')

# Step 3: Load the model and predict
# Load the trained model.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('trained_model.pkl', 'rb') as file:
    model = pickle.load(file)

# Load the cleaned data back from the Parquet file
df_real_cleaned = pd.read_parquet('troop_movements10m.parquet')

# Convert the timestamp column to datetime
df_real_cleaned['timestamp'] = pd.to_datetime(df_real_cleaned['timestamp'])

# Extract the individual date/time components from the timestamp
df_real_cleaned['year'] = df_real_cleaned['timestamp'].dt.year
df_real_cleaned['month'] = df_real_cleaned['timestamp'].dt.month
df_real_cleaned['day'] = df_real_cleaned['timestamp'].dt.day
df_real_cleaned['hour'] = df_real_cleaned['timestamp'].dt.hour
df_real_cleaned['minute'] = df_real_cleaned['timestamp'].dt.minute
df_real_cleaned['second'] = df_real_cleaned['timestamp'].dt.second

# Drop the original timestamp column (rebinding instead of mutating in place)
df_real_cleaned = df_real_cleaned.drop(columns=['timestamp'])

# One-hot encode the categorical columns the same way as for training
df_real_encoded = pd.get_dummies(df_real_cleaned, columns=['homeworld', 'unit_type'])

# Align the columns with the training data: `features` is the training feature
# frame built when the model was fitted and must still be defined in the kernel.
# Training columns missing here are added and filled with 0; columns that were
# not part of the training features are left out of the model input.
features_real = df_real_encoded.reindex(columns=features.columns, fill_value=0)

# Use the model to make predictions
df_real_encoded['is_resistance_pred'] = model.predict(features_real)

# Add the predicted values to the cleaned DataFrame
df_real_cleaned['is_resistance_pred'] = df_real_encoded['is_resistance_pred']

# Show the first few rows of the DataFrame with predictions
print(df_real_cleaned.head())
