In [None]:
### Cheatsheet
- import LONGNAME as short
    ```
    import numpy as np
    import pandas as pd
    ```
- CSV into DataFrame, DataFrame to CSV
    ```
    dataset = [{"date": "2021-01-01", "ice_cream_type": 1, "topping": 1, "location": 1},]
    df = pd.DataFrame(dataset)  # or: df = pd.read_csv("data.csv")
    df.to_csv("yum-yum-ice-cream.csv", index=False)
    ```
- check data
    ```
    # check data
    df.head()

    # .describe() produces all the summary statistics needed for the requirement (each metric can also be computed individually)
    df.describe()

    # Output general info about the table. Notice that we have some null values in all of our features
    df.info()

    # Histogram to show all the data distributions including the target
    df.hist()

    # Investigate to see if any data are correlated positively or negatively
    df.corr()
    ```
- load data & check keys
    ```
    from sklearn import datasets
    iris = datasets.load_iris()

    # check keys
    iris.keys()

    # Create the iris `data` dataset as a dataframe and name the columns with `feature_names`
    df = pd.DataFrame(iris["data"], columns=iris["feature_names"])

    # Include the target as well
    df['target'] = iris["target"]
    ```
- clean & convert
    ```
    from datetime import datetime
    from sklearn.preprocessing import StandardScaler

    # Drop all null values
    df = df.dropna()


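    # Change the date column to a datetime
    # NOTE: in pandas >= 2.0, `df.loc[:, col] = ...` sets values in place and may keep the old (object) dtype;
    # if the `.dt` accessor below fails, assign with `df["date"] = pd.to_datetime(df["date"])` instead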
    df.loc[:, "date"] = pd.to_datetime(df.loc[:, "date"])

    # Extract year, month, and day into separate columns
    df["year"] = df.date.dt.year
    df["month"] = df.date.dt.month
    df["day"] = df.date.dt.day


    # Change weather column to a category 
    df.loc[:, "weather"] = df["weather"].astype("category")

    # One hot encode the weather category to have individual features. Prefix with `weather`
    weather_one_hot_df = pd.get_dummies(df.weather, prefix="weather")

    # Add the one hot encoded values back to the df
    df[weather_one_hot_df.columns.tolist()] = weather_one_hot_df


    # Standardize feature values to have zero mean and unit variance
    scaler = StandardScaler()
    scaler.fit(df[all_features])
    df.loc[:, all_features] = scaler.transform(df[all_features])


    # Target values as a NumPy array, to compare against the predictions of supervised and unsupervised models
    df["target"].to_numpy()
    ```
- create model, score, predict
    ```
    from sklearn.linear_model import LinearRegression

    # initialize and fit a linear regression model
    reg = LinearRegression().fit(df[iris["feature_names"]], df["target"])

    # Score the linear regression model; the result is slightly deceiving, since the iris dataset is a classification problem, not a regression problem
    reg.score(df[iris["feature_names"]], df["target"])

    # Regression outputs floating-point numbers, not class labels
    reg.predict(df[iris["feature_names"]])


    from sklearn.linear_model import RidgeClassifier

    # Fit a ridge classifier, which matches the problem, since this is a classification task
    clf = RidgeClassifier(alpha=3.0).fit(df[iris["feature_names"]], df["target"])

    # Score the model
    clf.score(df[iris["feature_names"]], df["target"])

    # Predict the class values for the dataset; these will look much better!
    clf.predict(df[iris["feature_names"]])
    ```
- split data
    ```
    import pandas as pd
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    # Create a regression dataset with 1000 samples, 5 feature columns (2 of which are actually informative), and 1 target column
    regression_dataset = make_regression(n_samples=1000, n_features=5, n_informative=2, n_targets=1, random_state=0)
    df = pd.DataFrame(regression_dataset[0])
    df["target"] = regression_dataset[1]

    # Create a train: 0.8 | test: 0.2 ratio dataset
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

    # Split the remaining 80% again: test_size=0.25 of 0.8 is 0.2 of the full dataset,
    # giving a train: 0.6 | validation: 0.2 ratio dataset
    df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=0)

    # Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2
    # Output each shape to confirm the size of train/validation/test
    print(f"Train: {df_train.shape}")
    print(f"Validation: {df_val.shape}")
    print(f"Test: {df_test.shape}")
    ```
