In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/average-time-spent-by-a-user-on-social-media/dummy_data.csv


In [2]:
import pandas as pd

# Replace with the actual file path of your CSV file printed in the previous code
file_path = '/kaggle/input/average-time-spent-by-a-user-on-social-media/dummy_data.csv'

# Load the CSV file into a pandas DataFrame
dp = pd.read_csv(file_path)

# View the first 5 rows of the DataFrame
print(dp.head())


   age      gender  time_spent   platform  interests        location  \
0   56        male           3  Instagram     Sports  United Kingdom   
1   46      female           2   Facebook     Travel  United Kingdom   
2   32        male           8  Instagram     Sports       Australia   
3   60  non-binary           5  Instagram     Travel  United Kingdom   
4   25        male           1  Instagram  Lifestlye       Australia   

  demographics         profession  income  indebt  isHomeOwner  Owns_Car  
0        Urban  Software Engineer   19774    True        False     False  
1        Urban            Student   10564    True         True      True  
2    Sub_Urban   Marketer Manager   13258   False        False     False  
3        Urban            Student   12500   False         True     False  
4        Urban  Software Engineer   14566   False         True      True  


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Create severity levels (0-3) based on time spent quartiles in ascending order
dp['severity'] = pd.qcut(dp['time_spent'], q=4, labels=[0, 1, 2, 3], retbins=True)[0]

# Feature Engineering
# Create interaction features (Example: 'age' * 'income')
dp['age_income_interaction'] = dp['age'] * dp['income']

# Define categorical and numerical features
categorical_features = ['gender', 'platform', 'location']
numerical_features = ['age', 'time_spent', 'income', 'age_income_interaction']

# Handle missing values and standardize numerical variables
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# One-hot encode categorical variables
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Prepare the features and target variable
X = dp.drop(columns='severity')
y = dp['severity']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Create the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Create Logistic Regression and Random Forest models
lr_model = LogisticRegression(max_iter=200)
rf_model = RandomForestClassifier()

# Define hyperparameters for optimization (for XGBoost as an example)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

# Grid search for hyperparameter tuning on XGBoost
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=3)
grid_search.fit(X_train_processed, y_train)

# Best model from XGBoost
best_xgb_model = grid_search.best_estimator_
print(f"Best XGBoost hyperparameters: {grid_search.best_params_}")

# Define the soft voting classifier combining XGBoost, Logistic Regression, and Random Forest
voting_clf = VotingClassifier(estimators=[
    ('xgb', best_xgb_model), 
    ('lr', lr_model), 
    ('rf', rf_model)], 
    voting='soft')  # soft voting averages the predicted probabilities

# Train the soft voting classifier
voting_clf.fit(X_train_processed, y_train)

# Evaluate on the training set
train_predictions = voting_clf.predict(X_train_processed)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Optionally, if you have a validation set, evaluate on it
# Split the training set into training and validation for further evaluation
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the voting classifier on the new training set
voting_clf.fit(preprocessor.fit_transform(X_train_final), y_train_final)

# Validate the model
val_predictions = voting_clf.predict(preprocessor.transform(X_val))
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Evaluate on the test set
test_predictions = voting_clf.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Testing Accuracy: {test_accuracy:.2f}")


Best XGBoost hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Training Accuracy: 1.00
Validation Accuracy: 1.00
Testing Accuracy: 0.97


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Replace with actual file path
file_path = '/kaggle/input/average-time-spent-by-a-user-on-social-media/dummy_data.csv'

# Load data
dp = pd.read_csv(file_path)

# Define severity based on time spent using quantiles (Low, Moderate, High, Critical)
dp['severity'] = pd.qcut(dp['time_spent'], q=4, labels=['Low', 'Moderate', 'High', 'Critical'])

# Feature Engineering: Create interaction features (age * income)
dp['age_income_interaction'] = dp['age'] * dp['income']

# Define categorical and numerical features
categorical_features = ['gender', 'platform', 'location']
numerical_features = ['age', 'time_spent', 'income', 'age_income_interaction']

# Create pipelines for numerical and categorical features
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Prepare the features and target variable
X = dp.drop(columns='severity')
y = dp['severity']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Create models: XGBoost, Logistic Regression, Random Forest
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
lr_model = LogisticRegression(max_iter=200)
rf_model = RandomForestClassifier()

# Define the soft voting classifier combining XGBoost, Logistic Regression, and Random Forest
voting_clf = VotingClassifier(estimators=[
    ('xgb', xgb_model), 
    ('lr', lr_model), 
    ('rf', rf_model)], 
    voting='soft')

# Train the voting classifier
voting_clf.fit(X_train_processed, y_train)

# Evaluate on the test set
test_predictions = voting_clf.predict(X_test_processed)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Print predictions for test set
print("Predictions:", test_predictions)
print("Actual:", y_test.values)


Test Accuracy: 1.00
Predictions: ['Moderate' 'Critical' 'Low' 'Critical' 'Low' 'Low' 'Low' 'Low' 'Critical'
 'Critical' 'High' 'Critical' 'Low' 'Low' 'High' 'Moderate' 'Low' 'Low'
 'Critical' 'Moderate' 'Critical' 'High' 'Low' 'Moderate' 'High' 'Low'
 'High' 'Low' 'Moderate' 'Moderate' 'Low' 'Low' 'Critical' 'High' 'Low'
 'High' 'Critical' 'High' 'Critical' 'Moderate' 'Low' 'Moderate'
 'Moderate' 'Moderate' 'Low' 'Critical' 'Low' 'Low' 'High' 'Critical'
 'Moderate' 'High' 'High' 'Moderate' 'Low' 'Low' 'Moderate' 'High' 'Low'
 'Moderate' 'High' 'High' 'Critical' 'Critical' 'Critical' 'Moderate'
 'High' 'Low' 'Low' 'Critical' 'Low' 'Low' 'Moderate' 'High' 'High' 'High'
 'Moderate' 'Moderate' 'Moderate' 'High' 'Moderate' 'Moderate' 'Low'
 'Moderate' 'Moderate' 'Moderate' 'Moderate' 'Low' 'Moderate' 'High'
 'Moderate' 'High' 'Low' 'Critical' 'High' 'Moderate' 'Critical' 'High'
 'Low' 'Critical' 'Moderate' 'Low' 'Critical' 'Low' 'Critical' 'Moderate'
 'Low' 'Critical' 'Moderate' 'Low' 'High

In [6]:
# Sample inputs corresponding to severity levels: Low, Moderate, High, Critical
sample_data = pd.DataFrame({
    'age': [18, 25, 35, 45],                 # Varied ages for diverse users
    'gender': ['Male', 'Female', 'Female', 'Male'],   # Gender diversity
    'time_spent': [1, 5, 7, 20],          # Time spent (Low, Moderate, High, Critical)
    'platform': ['Instagram', 'Twitter', 'Facebook', 'Snapchat'],  # Different social media platforms
    'location': ['Urban', 'Suburban', 'Rural', 'Urban'],           # Varied locations
    'income': [30000, 50000, 70000, 90000]   # Varied income levels for interaction feature
})

# Feature Engineering: Create interaction features (age * income)
sample_data['age_income_interaction'] = sample_data['age'] * sample_data['income']

# Preprocess the sample data
sample_data_processed = preprocessor.transform(sample_data)

# Predict the severity using the trained model
sample_predictions = voting_clf.predict(sample_data_processed)

# Print the results
print("Sample Input Data:")
print(sample_data)
print("Predicted Severity Levels:", sample_predictions)


Sample Input Data:
   age  gender  time_spent   platform  location  income  \
0   18    Male           1  Instagram     Urban   30000   
1   25  Female           5    Twitter  Suburban   50000   
2   35  Female           7   Facebook     Rural   70000   
3   45    Male          20   Snapchat     Urban   90000   

   age_income_interaction  
0                  540000  
1                 1250000  
2                 2450000  
3                 4050000  
Predicted Severity Levels: ['Low' 'Moderate' 'High' 'Critical']
