<a href="https://colab.research.google.com/github/sujataprasad01/Cognizant-AI-Program-Project2/blob/main/python_module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Load data
def load_data(path: str = "/path/to/csv/"):
    """
    This function takes a path string to a CSV file and loads it into
    a Pandas DataFrame.

    :param      path (optional): str, relative path of the CSV file

    :return     df: pd.DataFrame
    """

    df = pd.read_csv(f"{path}")
    df.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')
    return df

# Create target variable and predictor variables
def create_target_and_predictors(
    data: pd.DataFrame = None,
    target: str = "estimated_stock_pct"
):
    """
    This function takes in a Pandas DataFrame and splits the columns
    into a target column and a set of predictor variables, i.e. X & y.
    These two splits of the data will be used to train a supervised
    machine learning model.

    :param      data: pd.DataFrame, dataframe containing data for the
                      model
    :param      target: str (optional), target variable that you want to predict

    :return     X: pd.DataFrame
                y: pd.Series
    """

    # Check to see if the target variable is present in the data
    if target not in data.columns:
        raise Exception(f"Target: {target} is not present in the data")

    X = data.drop(columns=[target])
    y = data[target]
    return X, y

# Train algorithm
def train_algorithm_with_cross_validation(
    X: pd.DataFrame = None,
    y: pd.Series = None
):
    """
    This function takes the predictor and target variables and
    trains a Random Forest Regressor model across K folds. Using
    cross-validation, performance metrics will be output for each
    fold during training.

    :param      X: pd.DataFrame, predictor variables
    :param      y: pd.Series, target variable

    :return
    """

    # Create a list that will store the accuracies of each fold
    accuracy = []

    # Enter a loop to run K folds of cross-validation
    k=10
    split=0.75
    for fold in range(0, k):

        # Instantiate algorithm and scaler
        model = RandomForestRegressor()
        scaler = StandardScaler()

        # Create training and test samples
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=split, random_state=42)

        # Scale X data, we scale the data because it helps the algorithm to converge
        # and helps the algorithm to not be greedy with large values
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Train model
        trained_model = model.fit(X_train, y_train)

        # Generate predictions on test sample
        y_pred = trained_model.predict(X_test)

        # Compute accuracy, using mean absolute error
        mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
        accuracy.append(mae)
        print(f"Fold {fold + 1}: MAE = {mae:.3f}")

    # Finish by computing the average MAE across all folds
    print(f"Average MAE: {(sum(accuracy) / len(accuracy)):.2f}")


In [47]:
data=load_data("/content/drive/MyDrive/Colab Notebooks/Cognizant Internship/4.3 sensor_stock_levels.csv")

In [61]:
data.head()

Unnamed: 0,id,timestamp,product_id,estimated_stock_pct
0,4220e505-c247-478d-9831-6b9f87a4488a,2022-03-07 12:13:02,f658605e-75f3-4fed-a655-c0903f344427,0.75
1,f2612b26-fc82-49ea-8940-0751fdd4d9ef,2022-03-07 16:39:46,de06083a-f5c0-451d-b2f4-9ab88b52609d,0.48
2,989a287f-67e6-4478-aa49-c3a35dac0e2e,2022-03-01 18:17:43,ce8f3a04-d1a4-43b1-a7c2-fa1b8e7674c8,0.58
3,af8e5683-d247-46ac-9909-1a77bdebefb2,2022-03-02 14:29:09,c21e3ba9-92a3-4745-92c2-6faef73223f7,0.79
4,08a32247-3f44-4002-85fb-c198434dd4bb,2022-03-02 13:46:18,7f478817-aa5b-44e9-9059-8045228c9eb0,0.22


In [64]:
data["timestamp"]=pd.to_datetime(data["timestamp"], format='%Y-%m-%d %H:%M:%S')

new_ts=[]
for i in data["timestamp"]:
  new_ts.append(i.strftime('%Y-%m-%d %H:00:00'))

data["timestamp"]=[datetime.strptime(i, '%Y-%m-%d %H:00:00') for i in new_ts]

data['timestamp_monthDay']=data['timestamp'].dt.day
data['timestamp_weekDay']=data['timestamp'].dt.day_of_week
data['timestamp_hour']=data['timestamp'].dt.hour
data.drop(columns=['timestamp'], inplace=True)

In [66]:
data.head()

Unnamed: 0,id,product_id,estimated_stock_pct,timestamp_monthDay,timestamp_weekDay,timestamp_hour
0,4220e505-c247-478d-9831-6b9f87a4488a,f658605e-75f3-4fed-a655-c0903f344427,0.75,7,0,12
1,f2612b26-fc82-49ea-8940-0751fdd4d9ef,de06083a-f5c0-451d-b2f4-9ab88b52609d,0.48,7,0,16
2,989a287f-67e6-4478-aa49-c3a35dac0e2e,ce8f3a04-d1a4-43b1-a7c2-fa1b8e7674c8,0.58,1,1,18
3,af8e5683-d247-46ac-9909-1a77bdebefb2,c21e3ba9-92a3-4745-92c2-6faef73223f7,0.79,2,2,14
4,08a32247-3f44-4002-85fb-c198434dd4bb,7f478817-aa5b-44e9-9059-8045228c9eb0,0.22,2,2,13


In [67]:
X,y=create_target_and_predictors(data=data,
    target=  "estimated_stock_pct")

In [68]:
X.drop(columns=['product_id', 'id'], inplace=True)
y.drop(columns=['product_id', 'id'], inplace=True)

In [70]:
X.head()

Unnamed: 0,timestamp_monthDay,timestamp_weekDay,timestamp_hour
0,7,0,12
1,7,0,16
2,1,1,18
3,2,2,14
4,2,2,13


In [71]:
train_algorithm_with_cross_validation(X=X, y=y)

Fold 1: MAE = 0.250
Fold 2: MAE = 0.250
Fold 3: MAE = 0.250
Fold 4: MAE = 0.250
Fold 5: MAE = 0.250
Fold 6: MAE = 0.250
Fold 7: MAE = 0.250
Fold 8: MAE = 0.250
Fold 9: MAE = 0.250
Fold 10: MAE = 0.250
Average MAE: 0.25
