In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [19]:
# Location of the raw screen-time CSV (machine-specific absolute path).
raw_csv_path = r"C:\Users\Sujal Naik\Desktop\Task 1\screen_time.csv"

# Read the whole file into memory and preview the first rows as a sanity check.
df = pd.read_csv(raw_csv_path)
print(df.head())

   Age Gender Screen Time Type Day Type  Average Screen Time (hours)  \
0    5   Male      Educational  Weekday                         0.44   
1    5   Male     Recreational  Weekday                         1.11   
2    5   Male            Total  Weekday                         1.55   
3    5   Male      Educational  Weekend                         0.50   
4    5   Male     Recreational  Weekend                         1.44   

   Sample Size  
0          500  
1          500  
2          500  
3          500  
4          500  


In [21]:
# Column groups: each group is routed to its own preprocessing pipeline below.
categorical_features = ["Gender", "Screen Time Type", "Day Type"]
numerical_features = ["Age", "Average Screen Time (hours)", "Sample Size"]

# Categorical branch: fill gaps with the most common value, then one-hot encode.
# Categories not seen during fit are encoded as all zeros instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric branch: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group through its branch; numeric outputs come first,
# followed by the one-hot encoded categorical outputs.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_features),
        ("cat", cat_pipeline, categorical_features),
    ]
)

In [22]:
# Fit the preprocessor on the full DataFrame and transform it in the same call.
# NOTE(review): this fit runs before the train/test split performed in a later
# cell, so the learned imputation/scaling statistics come from every row —
# confirm that is intended if the split is later used for model evaluation.
df_processed = preprocessor.fit_transform(df)
print(df_processed)  # Show the transformed feature matrix (the array repr elides the middle)

[[-1.58113883 -1.33111777  1.58113883 ...  0.          1.
   0.        ]
 [-1.58113883 -0.98178823  1.58113883 ...  0.          1.
   0.        ]
 [-1.58113883 -0.75237778  1.58113883 ...  1.          1.
   0.        ]
 ...
 [ 1.58113883 -0.41868986 -1.58113883 ...  0.          0.
   1.        ]
 [ 1.58113883  1.44787694 -1.58113883 ...  0.          0.
   1.        ]
 [ 1.58113883  2.58971529 -1.58113883 ...  1.          0.
   1.        ]]


In [23]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): this splits the raw `df`, not `df_processed`, and happens after
# the preprocessor was already fitted on all rows — verify this is intended.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Persist the transformed matrix with the transformer-generated feature names as
# the header (e.g. "num__Age", "cat__Gender_Male") instead of anonymous integer
# column labels, so the CSV is self-describing and matches what data_pipeline()
# writes.
processed_columns = preprocessor.get_feature_names_out()
pd.DataFrame(df_processed, columns=processed_columns).to_csv("processed_data.csv", index=False)

In [25]:
def data_pipeline(input_file="screen_time.csv", output_file="processed_screen_time.csv"):
    """Preprocess the screen-time CSV and write the transformed features to disk.

    Numeric columns are mean-imputed and standardised; categorical columns are
    imputed with the most frequent value and one-hot encoded. The preprocessor
    is fitted on the entire input file, and the result is written as a CSV whose
    header holds the transformer-generated feature names.

    Parameters
    ----------
    input_file : str
        Path of the raw CSV to read. Defaults to "screen_time.csv".
    output_file : str
        Path the processed CSV is written to. Defaults to
        "processed_screen_time.csv".

    Returns
    -------
    None
        Success or failure is reported with ``print``. Exceptions raised while
        loading, transforming or saving are caught and printed, not propagated.
    """
    try:
        # Load Data
        df = pd.read_csv(input_file)

        # Columns handled by each preprocessing branch
        numerical_features = ["Age", "Average Screen Time (hours)", "Sample Size"]
        categorical_features = ["Gender", "Screen Time Type", "Day Type"]

        # Fail early with an explicit list of absent columns rather than letting
        # the transformer raise a generic error further down.
        required_columns = numerical_features + categorical_features
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Input file is missing required column(s): {missing_columns}")

        # Numeric branch: mean imputation followed by standard scaling
        num_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ])

        # Categorical branch: most-frequent imputation followed by one-hot
        # encoding; categories unseen at fit time are encoded as all zeros
        cat_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ])

        # sparse_threshold=0 forces a dense array. With the default threshold the
        # combined output can come back as a SciPy sparse matrix when the one-hot
        # block dominates, which the DataFrame construction below does not handle.
        preprocessor = ColumnTransformer(
            [
                ("num", num_pipeline, numerical_features),
                ("cat", cat_pipeline, categorical_features)
            ],
            sparse_threshold=0,
        )

        # Apply Preprocessing
        df_processed = preprocessor.fit_transform(df)

        # Save transformed data, using the generated feature names as the header
        pd.DataFrame(df_processed, columns=preprocessor.get_feature_names_out()).to_csv(output_file, index=False)

        print(f"Data pipeline executed successfully. Processed data saved to {output_file}")

    except FileNotFoundError:
        print(f"Error: File not found at '{input_file}'. Please check the file path.")
    except Exception as e:  # Best-effort: report any other failure without raising
        print(f"An error occurred: {e}")


# Execute the end-to-end pipeline on the local copy of the dataset; the
# processed CSV is written to the current working directory.
data_pipeline(
    input_file="C:/Users/Sujal Naik/Desktop/Task 1/screen_time.csv",
    output_file="processed_screen_time.csv",
)

Data pipeline executed successfully. Processed data saved to processed_screen_time.csv
