In [None]:
import pandas as pd
# Read phone_usage_india.csv, shorten the verbose column headers and drop the
# "User ID" column in a single chained expression.
train_data = (
    pd.read_csv("phone_usage_india.csv")
    .rename(
        columns={
            "Data Usage (GB/month)": "data_usage(gb/mo)",
            "Calls Duration (mins/day)": "calls_dur(min/d)",
            "Number of Apps Installed": "installed_apps",
            "Social Media Time (hrs/day)": "soical_media(h/d)",
            "E-commerce Spend (INR/month)": "E-commerce(INR/m)",
            "Streaming Time (hrs/day)": "streaming(hrs/d)",
            "Screen Time (hrs/day)": "Scr_time(h/d)",
            "Gaming Time (hrs/day)": "gaming(hrs/d)",
            "Monthly Recharge Cost (INR)": "recharge(INR)",
        }
    )
    .drop(columns=["User ID"])
)
# Function to correct OS
def fix_os(brand):
    """Return the operating system implied by a phone brand.

    The brand "Apple" maps to "iOS"; every other value maps to "Android".
    """
    if brand == "Apple":
        return "iOS"
    return "Android"

# Build the "OS" column element-wise from "Phone Brand" using fix_os
train_data["OS"] = train_data["Phone Brand"].map(fix_os)

# Show the frame now that the OS column has been set
print(train_data)

# Remove exact duplicate rows, then every row that still has a missing value
train_data.drop_duplicates(inplace=True)
train_data.dropna(inplace=True)
train_data.columns
# Define numerical columns to check for outliers
num_cols = [
    'Scr_time(h/d)',
    'data_usage(gb/mo)',
    'calls_dur(min/d)',
    'installed_apps',
    'soical_media(h/d)',
    'E-commerce(INR/m)',
    'streaming(hrs/d)',
    'gaming(hrs/d)',
    'recharge(INR)',
]

# Function to remove outliers using IQR
def remove_outliers_iqr(train_data, cols):
    """Drop rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are processed in order, and each filter is applied to the rows
    that survived the previous columns, so the quartiles of later columns are
    computed on already-filtered data.

    Args:
        train_data: DataFrame to filter. It is not modified.
        cols: Iterable of numeric column names to check.

    Returns:
        A new, independent DataFrame holding only the rows whose value lies in
        [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for every listed column. Rows with
        NaN in a checked column are dropped, because the range test is False
        for NaN.
    """
    filtered = train_data
    for col in cols:
        q1 = filtered[col].quantile(0.25)  # 25th percentile
        q3 = filtered[col].quantile(0.75)  # 75th percentile
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # between() is inclusive on both ends, matching >= lower and <= upper
        filtered = filtered[filtered[col].between(lower_bound, upper_bound)]

    # Hand back an explicit copy: a boolean-indexed slice would trigger
    # SettingWithCopyWarning as soon as the caller adds a column to it.
    return filtered.copy()

train_data = remove_outliers_iqr(train_data, num_cols)
train_data.head(15)

# Convert the hour-based usage columns to whole minutes. Round before the
# integer cast: astype(int) truncates, so a product such as 4.1 * 60
# (245.99999999999997 in floating point) would otherwise be stored as 245.
train_data['Scr_time(min/d)'] = (train_data['Scr_time(h/d)'] * 60).round().astype(int)
train_data['soical_media(min/d)'] = (train_data['soical_media(h/d)'] * 60).round().astype(int)
train_data['streaming(min/d)'] = (train_data['streaming(hrs/d)'] * 60).round().astype(int)
train_data['gaming(min/d)'] = (train_data['gaming(hrs/d)'] * 60).round().astype(int)
import matplotlib.pyplot as plt
import seaborn as sns

# Grouped count plot: rows per phone brand, split by primary use
plt.figure(figsize=(12, 6))
sns.countplot(data=train_data, x="Phone Brand", hue="Primary Use", palette="Set2")
plt.title("Primary Use by Phone Brand")
plt.xlabel("Phone Brand")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.legend(title="Primary Use")
plt.show()
# Keep only the Apple and Samsung rows, then count them per location
is_apple_or_samsung = train_data["Phone Brand"].isin(["Apple", "Samsung"])
apple_samsung_df = train_data[is_apple_or_samsung]
plt.figure(figsize=(10, 6))
sns.countplot(data=apple_samsung_df, x="Location", hue="Phone Brand", palette="coolwarm")
plt.title("Apple and Samsung Users by Location")
plt.xlabel("Location")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.legend(title="Phone Brand")
plt.show()
# Pie chart of the share of each "Primary Use" category
phone_brand_counts = train_data["Primary Use"].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(
    x=phone_brand_counts,
    labels=phone_brand_counts.index,
    colors=sns.color_palette("pastel"),
    autopct='%1.1f%%',
)
plt.title("Primary Use Distribution")
plt.show()

# Same chart for the "OS" column; the counts variable is reused
phone_brand_counts = train_data["OS"].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(
    x=phone_brand_counts,
    labels=phone_brand_counts.index,
    colors=sns.color_palette("pastel"),
    autopct='%1.1f%%',
)
plt.title("Operating System Distribution")
plt.show()
# Bar plot of screen time (hours/day) per gender
plt.figure(figsize=(8, 6))
sns.barplot(data=train_data, x="Gender", y="Scr_time(h/d)", palette="coolwarm")
plt.title("Average Screen Time by Gender")
plt.xlabel("Gender")
plt.ylabel("Average Screen Time (hrs/day)")
plt.show()

# Drop the hour-based columns; minute-based versions were added earlier.
train_data.drop(columns=["Scr_time(h/d)","soical_media(h/d)","streaming(hrs/d)","gaming(hrs/d)"], inplace=True)

# Encode Gender and OS column by column. A frame-wide replace() would also
# rewrite any other column that happens to contain one of these strings.
# The Gender codes match the ones used for test_data later in this file.
train_data["Gender"] = train_data["Gender"].replace({"Female": 1, "Male": 0, "Other": 2})
train_data["OS"] = train_data["OS"].replace({"Android": 1, "iOS": 0})

# One-hot encode Location and Phone Brand (first level dropped), then remove
# the original string columns.
train_data = train_data.join(pd.get_dummies(train_data["Location"], prefix="location", drop_first=True, dtype=int))
train_data = train_data.join(pd.get_dummies(train_data["Phone Brand"], prefix="Phone_Brand", drop_first=True, dtype=int))
train_data.drop(columns=["Location","Phone Brand"], inplace=True)
train_data["Primary Use"].value_counts(normalize=True)

# Target labels are 0-based and contiguous: recent XGBClassifier releases
# reject labels that do not start at 0, and this is the same mapping the
# test_data pipeline uses. Categories missing from the mapping become NaN.
mapping = {'Education': 0, 'Gaming': 1, 'Work': 2, "Social Media": 3, "Entertainment": 4}
train_data['Primary Use'] = train_data['Primary Use'].map(mapping)
train_data.columns

# Store call duration and data usage as whole numbers
train_data['calls_dur(min/d)'] = train_data['calls_dur(min/d)'].round().astype(int)
train_data['data_usage(gb/mo)'] = train_data['data_usage(gb/mo)'].round().astype(int)

# data_train is an alias of train_data, not a copy
data_train = train_data
data_train
import pandas as pd
# Read phone_usage_india.csv, shorten the column headers and drop "User ID".
# NOTE(review): this is the same file the training pipeline reads — confirm
# whether a separate hold-out file was intended.
test_data = (
    pd.read_csv("phone_usage_india.csv")
    .rename(
        columns={
            "Data Usage (GB/month)": "data_usage(gb/mo)",
            "Calls Duration (mins/day)": "calls_dur(min/d)",
            "Number of Apps Installed": "installed_apps",
            "Social Media Time (hrs/day)": "soical_media(h/d)",
            "E-commerce Spend (INR/month)": "E-commerce(INR/m)",
            "Streaming Time (hrs/day)": "streaming(hrs/d)",
            "Screen Time (hrs/day)": "Scr_time(h/d)",
            "Gaming Time (hrs/day)": "gaming(hrs/d)",
            "Monthly Recharge Cost (INR)": "recharge(INR)",
        }
    )
    .drop(columns=["User ID"])
)
# Function to correct OS
def fix_os(brand):
    """Map a phone brand to its operating system name.

    Only the exact string "Apple" yields "iOS"; anything else yields
    "Android".
    """
    is_apple = brand == "Apple"
    if is_apple:
        return "iOS"
    return "Android"

# Apply function to create a clean OS column
test_data["OS"] = test_data["Phone Brand"].apply(fix_os)

# Display the updated DataFrame
print(test_data)

# dropna() returns a new frame unless inplace=True; the bare call discarded
# its result and left rows with missing values in test_data.
test_data.dropna(inplace=True)
test_data.columns

# Define numerical columns to check for outliers
num_cols = [
    'Scr_time(h/d)',
    'data_usage(gb/mo)',
    'calls_dur(min/d)',
    'installed_apps',
    'soical_media(h/d)',
    'E-commerce(INR/m)',
    'streaming(hrs/d)',
    'gaming(hrs/d)',
    'recharge(INR)',
]

# Function to remove outliers using IQR
def remove_outliers_iqr(test_data, cols):
    """Drop rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are handled one after another; each column's quartiles are
    computed on the rows that survived the filters of the preceding columns.

    Args:
        test_data: DataFrame to filter. It is not modified.
        cols: Iterable of numeric column names to check.

    Returns:
        A new, independent DataFrame holding only the rows whose value lies in
        [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for every listed column. Rows with
        NaN in a checked column are dropped, because the range test is False
        for NaN.
    """
    kept = test_data
    for col in cols:
        q1 = kept[col].quantile(0.25)  # 25th percentile
        q3 = kept[col].quantile(0.75)  # 75th percentile
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # between() is inclusive on both ends, matching >= lower and <= upper
        kept = kept[kept[col].between(lower_bound, upper_bound)]

    # Return an explicit copy so the caller can add columns without
    # SettingWithCopyWarning from a boolean-indexed slice.
    return kept.copy()

test_data = remove_outliers_iqr(test_data, num_cols)

# Convert the hour-based usage columns to whole minutes. Round before the
# integer cast: astype(int) truncates, so a product such as 4.1 * 60
# (245.99999999999997 in floating point) would otherwise be stored as 245.
test_data['Scr_time(min/d)'] = (test_data['Scr_time(h/d)'] * 60).round().astype(int)
test_data['soical_media(min/d)'] = (test_data['soical_media(h/d)'] * 60).round().astype(int)
test_data['streaming(min/d)'] = (test_data['streaming(hrs/d)'] * 60).round().astype(int)
test_data['gaming(min/d)'] = (test_data['gaming(hrs/d)'] * 60).round().astype(int)
import matplotlib.pyplot as plt
import seaborn as sns

# Drop the hour-based columns; minute-based versions were added just above.
test_data.drop(columns=["Scr_time(h/d)","soical_media(h/d)","streaming(hrs/d)","gaming(hrs/d)"], inplace=True)

# Encode Gender and OS column by column. A frame-wide replace() would also
# rewrite any other column that happens to contain one of these strings.
test_data["Gender"] = test_data["Gender"].replace({"Female": 1, "Male": 0, "Other": 2})
test_data["OS"] = test_data["OS"].replace({"Android": 1, "iOS": 0})

# One-hot encode Location and Phone Brand (first level dropped), then remove
# the original string columns.
test_data = test_data.join(pd.get_dummies(test_data["Location"], prefix="location", drop_first=True, dtype=int))
test_data = test_data.join(pd.get_dummies(test_data["Phone Brand"], prefix="Phone_Brand", drop_first=True, dtype=int))
test_data.drop(columns=["Location","Phone Brand"], inplace=True)
test_data["Primary Use"].value_counts(normalize=True)
test_data.isnull().sum().sum()

# Remove duplicate rows and rows with missing values from the encoded frame
test_data.drop_duplicates(inplace=True)
test_data.dropna(inplace=True)

# 0-based class labels; categories missing from the mapping become NaN
mapping = {'Education': 0, 'Gaming': 1, 'Work': 2, "Social Media": 3, "Entertainment": 4}
test_data['Primary Use'] = test_data['Primary Use'].map(mapping)
test_data.columns

# Store call duration and data usage as whole numbers
test_data['calls_dur(min/d)'] = test_data['calls_dur(min/d)'].round().astype(int)
test_data['data_usage(gb/mo)'] = test_data['data_usage(gb/mo)'].round().astype(int)

# data_test is an alias of test_data, not a copy
data_test = test_data
data_test
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Target is the encoded "Primary Use" column; every other column is a feature
y = data_train["Primary Use"]
X = data_train.drop(columns=["Primary Use"])

# #Finds correlation between Independent and dependent attributes
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize = (18,18))
# sns.heatmap(data_train.corr(), annot = True, cmap = "RdYlGn")

# plt.show()
from sklearn.ensemble import ExtraTreesClassifier
# Seeded like the other estimators in this file, so the fitted importances
# are reproducible from run to run.
selection = ExtraTreesClassifier(random_state=42)
selection.fit(X,y)
# plt.figure(figsize=(12,8))
# feat_importance = pd.Series(selection.feature_importances_,index=X.columns)
# feat_importance.nlargest(20).plot(kind='barh')
# plt.show()
# import seaborn as sns
# import matplotlib.pyplot as plt

# sns.countplot(x=train_data["Primary Use"])
# plt.show()
<!-- ## Fitting model using Random Forest -->
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Baseline forest, fitted only to obtain per-feature importance scores
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Importance scores paired with their column names, largest first
importances = rf.feature_importances_
feature_names = X_train.columns
important_features = pd.Series(data=importances, index=feature_names).sort_values(
    ascending=False
)

# Print the full ranking
print("Feature Importance:\n", important_features)
# Features whose importance does not exceed this cut-off are discarded
threshold = 0.02
above_threshold = important_features > threshold
selected_features = important_features.loc[above_threshold].index

# Restrict both splits to the retained columns
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

print(f"Selected {len(selected_features)} features out of {X_train.shape[1]}")

# Refit a deeper-capped, larger forest on the reduced feature set
rf_selected = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
)
rf_selected.fit(X_train_selected, y_train)

# Accuracy on the training split and on the held-out split
train_acc = rf_selected.score(X_train_selected, y_train)
test_acc = rf_selected.score(X_test_selected, y_test)

print(f"Train Accuracy after feature selection: {train_acc:.4f}")
print(f"Test Accuracy after feature selection: {test_acc:.4f}")

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values (3 x 3 x 3 x 3 combinations)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
)

# Exhaustive 5-fold search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator fitted with it
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Accuracy of the winning estimator on both splits
train_acc = best_rf.score(X_train, y_train)
test_acc = best_rf.score(X_test, y_test)

print(f"Optimized Train Accuracy: {train_acc:.4f}")
print(f"Optimized Test Accuracy: {test_acc:.4f}")

# from sklearn.ensemble import RandomForestClassifier
# import numpy as np

# # Train a basic model and check feature importances
# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)

# # Get feature importance
# importances = rf.feature_importances_
# feature_names = X.columns
# important_features = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# # Display top important features
# print(important_features.head(10))

# print(data_train.isnull().sum())  # Look for missing values
print(data_train.describe())      # Check for outliers

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Untuned 100-tree forest fitted on the training split
model_reg_cl = RandomForestClassifier(n_estimators=100, random_state=42)
model_reg_cl.fit(X_train, y_train)

# Get feature importance
# feature_importance = pd.Series(model_reg_cl.feature_importances_, index=X_train.columns)
# feature_importance.nlargest(10).plot(kind='barh')
# plt.show()    

# Print both scores explicitly: as bare expressions they are discarded when
# the file runs as a script, and a notebook cell shows only the last one.
print(f"Baseline RF Train Accuracy: {model_reg_cl.score(X_train, y_train):.4f}")
print(f"Baseline RF Test Accuracy: {model_reg_cl.score(X_test, y_test):.4f}")
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# # Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],  # Number of trees
#     'max_depth': [10, 20, 30, None],  # Depth of trees
#     'min_samples_split': [2, 5, 10],  # Minimum samples to split
#     'min_samples_leaf': [1, 2, 4],  # Minimum samples per leaf
#     'max_features': ['sqrt', 'log2'],  # Number of features per split
#     'bootstrap': [True, False]  # Whether to use bootstrapping
# }

# # Initialize Random Forest
# rf = RandomForestClassifier(random_state=42)

# # Grid Search
# grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
# grid_search.fit(X_train, y_train)

# # Print best parameters
# print("Best Hyperparameters:", grid_search.best_params_)
# best_model = grid_search.best_estimator_  # If using GridSearchCV
# # best_model = random_search.best_estimator_  # If using RandomizedSearchCV

# from sklearn.metrics import accuracy_score

# # Predictions
# y_train_pred = best_model.predict(X_train)
# y_test_pred = best_model.predict(X_test)

# # Accuracy scores
# train_score = accuracy_score(y_train, y_train_pred)
# test_score = accuracy_score(y_test, y_test_pred)

# print(f"Training Accuracy: {train_score:.4f}")
# print(f"Testing Accuracy: {test_score:.4f}")

## Logistic Regression classification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Standardize the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the held-out split.
# NOTE: X_train and X_test are rebound to the scaled arrays here, so every
# later statement that uses these names works on the standardized data, and
# re-running these lines would scale the already-scaled arrays again.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Initialize and train model (up to 500 solver iterations, fixed seed)
log_model = LogisticRegression(max_iter=500, random_state=42)
log_model.fit(X_train, y_train)

# Predictions on both splits
y_train_pred = log_model.predict(X_train)
y_test_pred = log_model.predict(X_test)
# Accuracy on both splits
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")

# Classification Report (held-out split only)
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

# Confusion Matrix (held-out split only)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

from sklearn.tree import DecisionTreeClassifier

# Initialize and train the decision tree (Gini criterion, depth capped at 10)
dt_model = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=42)
dt_model.fit(X_train, y_train)

# Predictions on both splits
y_train_pred = dt_model.predict(X_train)
y_test_pred = dt_model.predict(X_test)

# Evaluate the decision tree's own predictions. A copy-pasted logistic
# regression refit used to overwrite y_train_pred / y_test_pred at this point,
# so the metrics below never described the tree. log_model is already fitted
# earlier in the file with the same settings, so that refit is dropped.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")

# Classification Report
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

from xgboost import XGBClassifier

# Gradient-boosted trees: 100 rounds, learning rate 0.1, depth capped at 6
# NOTE(review): recent XGBoost releases expect class labels 0..n_classes-1 —
# confirm the label mapping used to build y.
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predicted labels for both splits
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Share of correctly predicted labels on each split
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")

# Per-class precision / recall / F1 on the held-out split
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

# Actual-vs-predicted label counts on the held-out split
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the XGBoost search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Exhaustive 5-fold search scored by accuracy, using all available cores
grid_search = GridSearchCV(XGBClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best Hyperparameters:", grid_search.best_params_)

# GridSearchCV refits the best configuration on all of X_train by default
# (refit=True), so best_estimator_ is already trained; fitting it again on the
# same data only repeated that work.
best_xgb = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split
print("Improved Test Accuracy:", accuracy_score(y_test, best_xgb.predict(X_test)))
