In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle

# Load the dataset
file_path = '../data/instagram_profile_final.csv'  # Update this with your file path
data = pd.read_csv(file_path)

# Show which raw labels are present before they are remapped
print("Unique values in 'class':", data['class'].unique())

# Map class labels to binary (1: real, 0: fake).
# Any label that is not a key of the mapping becomes NaN and is removed below.
class_mapping = {'r': 1, 'f': 0}
data['class'] = data['class'].map(class_mapping)

# Drop rows whose label was unmapped or missing, then rows with any missing feature
data = data.dropna(subset=['class'])
data = data.dropna()

# Separate features and target
X = data.drop('class', axis=1)
y = data['class']

# Split BEFORE scaling so that the scaler never sees the held-out rows.
# Fitting the scaler on the full dataset leaks test-set statistics (mean/std)
# into preprocessing and makes the evaluation optimistic in general.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only; reuse it for every other transform
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with code that expects the fully scaled matrix
X_scaled = scaler.transform(X)

# Train the Decision Tree model
dt_model_final = DecisionTreeClassifier(random_state=42)
dt_model_final.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred = dt_model_final.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Prediction Function
def predict_new_data(new_data, *, model=None, feature_scaler=None):
    """
    Predict the class for new data points.

    The rows are validated, scaled with the fitted scaler and then passed to
    the classifier, so callers must supply raw (unscaled) feature values.

    Args:
        new_data (list or 2D array): New data points to predict, one row per
            sample and one column per feature.
        model: Optional fitted classifier exposing ``predict``. Defaults to
            the module-level ``dt_model_final``.
        feature_scaler: Optional fitted scaler exposing ``transform``.
            Defaults to the module-level ``scaler``.

    Returns:
        numpy.ndarray: The output of ``predict`` for the scaled rows. With the
        default model the labels are 1 for real and 0 for fake.

    Raises:
        ValueError: If ``new_data`` is empty, is not two-dimensional, is not
            numeric, or does not have the expected number of features.
    """
    import numpy as np  # local import: numpy is not imported at module level

    active_model = dt_model_final if model is None else model
    active_scaler = scaler if feature_scaler is None else feature_scaler

    # Converting up front rejects ragged or non-numeric rows and lets every
    # row be checked at once instead of only the first one.
    samples = np.asarray(new_data, dtype=float)
    if samples.ndim != 2 or samples.shape[0] == 0:
        raise ValueError(
            "New data must be a non-empty 2D array of shape (n_samples, n_features)."
        )

    # Prefer the feature count recorded by the scaler; fall back to the
    # training frame when the scaler does not expose it.
    expected_features = getattr(active_scaler, "n_features_in_", None)
    if expected_features is None:
        expected_features = X.shape[1]

    if samples.shape[1] != expected_features:
        raise ValueError(
            f"New data must have {expected_features} features, "
            f"but got {samples.shape[1]} features."
        )

    # Scale with the already-fitted scaler, then classify
    new_data_scaled = active_scaler.transform(samples)
    return active_model.predict(new_data_scaled)

# Demo: classify two hand-written samples.
# Each row needs one value per training feature (17 here); the rows are passed
# through the fitted scaler inside predict_new_data before classification.
example_data = [
    # Example data point 1 with 17 features
    [
        0.5, 0.8, -0.3, 1.2, 0.7, 0.0, 0.1, 0.2, -0.1,
        0.4, 0.5, 1.1, 0.6, 0.3, -0.2, 0.8, 0.9,
    ],
    # Example data point 2 with 17 features
    [
        1.0, -0.5, 0.4, 0.6, -1.2, 0.8, 0.3, 0.4, 0.1,
        0.9, 1.0, -0.4, 0.7, 0.2, 0.5, 0.9, -0.1,
    ],
]

# Alternative sample set kept for reference (uncomment to try it):
# example_data = [
#     [1.0, 0.9, 0.7, 0.6, 1.2, 0.8, 0.6, -0.5, 0.3, 1.0, 0.9, 1.3, 1.2, 0.7, 0.1, 0.8, 0.9],
#     [0.9, 1.1, 0.6, 0.5, 1.0, 0.7, 0.5, -0.4, 0.4, 0.8, 1.0, 1.1, 0.9, 0.6, 0.2, 0.7, 1.0],
# ]

predicted_classes = predict_new_data(example_data)
print("Predicted Classes for New Data:", predicted_classes)

# Persist the trained tree; "wb" overwrites any existing file at this path.
# NOTE(review): only the classifier is written here. The fitted scaler is not
# saved, so code that reloads this pickle needs the same scaler from elsewhere.
with open("../models/decision_tree_model_final.pkl", "wb") as model_out:
    pickle.dump(dt_model_final, file=model_out)

print("Models saved successfully!")



Unique values in 'class': ['f' 'r' 's' 'i' 'a']
Accuracy: 0.8764491337762147

Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.85      0.86      6613
         1.0       0.89      0.90      0.89      8741

    accuracy                           0.88     15354
   macro avg       0.87      0.87      0.87     15354
weighted avg       0.88      0.88      0.88     15354

Predicted Classes for New Data: [0. 0.]
Models saved successfully!


