# Regression Template
v1.0

### Imports

In [1204]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" theme for every chart drawn in this notebook
sns.set_style(style="whitegrid")

### Read Functions

In [1205]:
def show_data(df):
    """
    Display the first two and the last two records of a DataFrame.

    Frames with four rows or fewer are printed whole, so that no record is
    shown twice when the head and tail slices would overlap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.

    Returns
    -------
    None
        The preview is written to stdout with `print`.
    """
    if len(df) <= 4:
        # head(2) and tail(2) would overlap (or cover everything): show it all once
        preview = df
    else:
        preview = pd.concat([df.head(2), df.tail(2)])
    print(preview)


def show_missing_data(df):
    """
    Print a per-column summary of missing values.

    The printed table has one row per column of `df`, holding the number of
    null entries ("# missing") and the share of null entries as a percentage
    rounded to two decimals ("% missing"), both sorted in descending order.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    null_share = (null_counts / null_mask.count()) * 100

    summary = pd.concat(
        [
            null_counts.sort_values(ascending=False),
            null_share.sort_values(ascending=False).round(2),
        ],
        axis=1,
        keys=["# missing", "% missing"],
    )
    print(summary)


def show_unique_values(df, fields):
    """
    Print the distinct values of each requested column.

    For every name in `fields`, one line is printed with the column's unique
    values; a name that is not a column of `df` gets a "not found" line
    instead of raising.
    """
    for name in fields:
        try:
            distinct = df[name].unique()
        except KeyError:
            print(f"`{name}` not found in DataFrame")
        else:
            print(f"{name}: {distinct}")

### Write Functions

In [1206]:
from sklearn.impute import SimpleImputer


def update_null_values(df, fields, strategy, fill_value=np.nan):
    """
    Impute missing values (NaN) in the selected columns with SimpleImputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied, never modified.
    fields : str or list of str
        Column name(s) to impute. A single name is treated as a one-element
        list.
    strategy : {'constant', 'most_frequent', 'mean', 'median'}
        Imputation strategy passed to SimpleImputer:
        - 'mean' / 'median': column mean / median (numeric columns only)
        - 'most_frequent': most common value of the column
        - 'constant': replace every missing value with `fill_value`
    fill_value : scalar, default np.nan
        `fill_value` only applies when strategy='constant'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with `fields` imputed. If a ValueError is raised while
        imputing, the message is printed and the original `df` is returned.
    """
    # A bare column name would select a 1-D Series, which SimpleImputer rejects
    columns = [fields] if isinstance(fields, str) else fields
    try:
        imputer = SimpleImputer(
            missing_values=np.nan, strategy=strategy, fill_value=fill_value
        )
        df_transformed = df.copy()
        # Statistics are learned from, and applied to, the same columns
        df_transformed[columns] = imputer.fit_transform(df[columns])
        return df_transformed
    except ValueError as e:
        print(f"❌ Error: {e}")
        return df

In [1207]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


def encode_categorical_data(df, fields, encoder):
    """
    Encode categorical columns of a DataFrame and return the result as a new frame.

    Supported values for `encoder`:
    - "OneHot": one binary column per category, all categories kept. The
      encoded columns come first, followed by the untouched columns. The frame
      is rebuilt from an array, so original column dtypes are not carried over.
    - "Dummy": binary columns from `pd.get_dummies` with the first category of
      each field dropped; the dummies are appended after the untouched columns.
      Typically preferred for unregularised linear models, where keeping every
      category makes the columns perfectly collinear.
    - "Label": each field is replaced by integer codes 0..N-1. The codes imply
      an order, so this suits ordinal data or models insensitive to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    fields : list of str
        Names of the categorical columns to encode.
    encoder : {"OneHot", "Dummy", "Label"}
        Encoding scheme to apply.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. If `encoder` is unknown, or a KeyError is raised
        while encoding, a message is printed and the original `df` is returned.
    """
    try:
        if encoder == "OneHot":
            # Create a ColumnTransformer, applying OneHotEncoder to specified fields
            ct = ColumnTransformer(
                transformers=[("encoder", OneHotEncoder(), fields)],
                remainder="passthrough",
                # Always produce a dense array: with the default threshold a
                # mostly-zero result comes back as a scipy sparse matrix, which
                # the DataFrame constructor below cannot wrap
                sparse_threshold=0,
            )
            # Apply ColumnTransformer, resulting in an array
            transformed_data = ct.fit_transform(df)
            # Create new column names for the one-hot encoded columns
            encoded_columns = ct.named_transformers_["encoder"].get_feature_names_out(
                fields
            )
            # Combine the new column names with the non-transformed columns
            # (the transformer emits encoded columns first, passthrough last)
            non_transformed_columns = [col for col in df.columns if col not in fields]
            new_column_names = list(encoded_columns) + non_transformed_columns
            # Create a DataFrame from the transformed data
            df_transformed = pd.DataFrame(
                transformed_data, columns=new_column_names, index=df.index
            )

        elif encoder == "Dummy":
            # Create dummy variables
            dummies = pd.get_dummies(df[fields], drop_first=True)
            # Drop the original fields and concatenate the dummy variables
            df_transformed = pd.concat([df.drop(fields, axis=1), dummies], axis=1)

        elif encoder == "Label":
            df_transformed = df.copy()
            # update original target fields with 0-N categorical values
            for field in fields:
                le = LabelEncoder()
                df_transformed[field] = le.fit_transform(df_transformed[field])
        else:
            print(f"encoder `{encoder}` not found")
            return df

        return df_transformed
    except KeyError as e:
        print(f"❌ Error: {e}")
        return df


In [1208]:
from sklearn.preprocessing import StandardScaler

# Module-level scaler used by scale_features(); every call refits it on the
# training columns it is given
sc = StandardScaler()


def scale_features(X_train, X_test, fields):
    """
    Standardise the given columns of a train/test pair with the shared scaler `sc`.

    - Only for non-dummy numerical features
    - For KNN, SVM or Logistic Reg/Linear Reg/NN with Gradient descent optimisation

    The scaler is fitted on the training columns and then applied to the test
    columns. The inputs are left untouched and scaled copies are returned. If
    any exception occurs, it is printed and the original inputs are returned.
    """
    try:
        # Work on copies so the caller's frames keep their raw values
        train_out = X_train.copy()
        test_out = X_test.copy()

        # Fit on the training columns, then reuse the fitted scaler for test
        train_out[fields] = sc.fit_transform(X_train[fields])
        test_out[fields] = sc.transform(X_test[fields])
    except Exception as e:
        print(f"❌ Error: {e}")
        return X_train, X_test
    return train_out, test_out

### Model Functions

In [1209]:
# NOTE: `from sklearn.linear_model import LinearRegression` is NOT for classification; LogisticRegression is used instead
from sklearn.linear_model import LogisticRegression

def fit_logistic_regression(X_train, y_train, max_iter=1000, **kwargs):
    """
    Fit a LogisticRegression classifier on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features, passed to `LogisticRegression.fit`.
    y_train : array-like
        Training target, passed to `LogisticRegression.fit`.
    max_iter : int, default 1000
        Maximum number of solver iterations.
    **kwargs
        Any further keyword arguments are forwarded to the
        `LogisticRegression` constructor.

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    model = LogisticRegression(max_iter=max_iter, **kwargs)
    model.fit(X_train, y_train)
    return model

### Data Collection

In [1210]:
# Read the Titanic training CSV and preview its first and last records
df = pd.read_csv(
    filepath_or_buffer="../src/v1/07_scikit-learn/filez/titanic_train.csv"
)
show_data(df=df)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch     Ticket     Fare Cabin Embarked  
0        0  A/5 21171   7.2500   NaN        S  
1        0   PC 17599  71.2833   C85        C  
889      0     111369  30.0000  C148        C  
890      0     370376   7.7500   NaN        Q  


- field_1: description_1. Explanation.
- field_2: description_2. Explanation.
- field_3: description_3. Explanation.
- field_4: description_4 (0 = No, 1 = Yes). Explanation.

### Exploratory Data Analysis

In [1211]:
# Display DataFrame info: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [1212]:
# Count the null entries per column and express them as a share of all rows
show_missing_data(df=df)

             # missing  % missing
Cabin              687      77.10
Age                177      19.87
Embarked             2       0.22
PassengerId          0       0.00
Survived             0       0.00
Pclass               0       0.00
Name                 0       0.00
Sex                  0       0.00
SibSp                0       0.00
Parch                0       0.00
Ticket               0       0.00
Fare                 0       0.00


In [1213]:
# Describe statistics on numerical fields (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [1214]:
# List the distinct values of the selected categorical fields
show_unique_values(df=df, fields=["Sex", "Embarked"])

Sex: ['male' 'female']
Embarked: ['S' 'C' 'Q' nan]


In [1215]:
#@TODO: seaborn charts

### Data cleaning & Preprocessing

**TODO (to be removed):** checklist for this section:
- remove or update null values
- manage outliers
- drop irrelevant fields (i.e.: ids, names, ..)
- correct data entry errors

In [1216]:
# Update null values: replace missing `Age` entries with the column mean.
# `fill_value` is not passed because it only applies when strategy='constant'.
# NOTE(review): the mean is computed on the full dataset, before the
# train/test split further down.
df = update_null_values(df=df, strategy="mean", fields=["Age"])
# Remove unnecessary fields; errors="ignore" keeps this cell re-runnable once
# the columns have already been dropped from `df`
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

### Encoding categorical data

In [1217]:
# Three encoders are available; exactly one call should be active at a time.
# OneHot encoder - binary values / keep all values
# df = encode_categorical_data(df=df, fields=['Embarked', 'Sex'], encoder='OneHot')
# Label encoder - integer values 0-N / keep all values
# df = encode_categorical_data(df=df, fields=['Embarked', 'Sex'], encoder='Label')
# Dummy encoder - binary values / remove first value
df = encode_categorical_data(
    df=df,
    encoder="Dummy",
    fields=["Embarked", "Sex"],
)
df.head(n=2)


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
0,0,3,22.0,1,0,7.25,False,True,True
1,1,1,38.0,1,0,71.2833,False,False,False


### Splitting dataset into Train/Test set

In [1218]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix
y = df["Survived"]
X = df.drop(columns="Survived")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

### Feature scaling

In [1219]:
# Feature scaling is currently disabled: the call below is commented out, so
# the model in the next section is fitted on the unscaled X_train.
# X_train_scaled, X_test_scaled = scale_features(
#     X_train=X_train, X_test=X_test, fields=["Pclass", "Age", "SibSp"]
# )

# X_train_scaled.head(2)
# Display the training target instead
y_train


733    0
857    1
81     1
319    1
720    1
      ..
575    0
838    1
337    1
523    1
863    0
Name: Survived, Length: 712, dtype: int64

### Fit Models

In [1220]:
# Train the classifier on the training split, then predict labels for the test split
lr = fit_logistic_regression(X_train, y_train)
predictions = lr.predict(X=X_test)

### Predictions

In [1221]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predictions on the test split
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83        99
           1       0.83      0.71      0.77        80

    accuracy                           0.80       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.80      0.80       179



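TODO: assign the result of each transformation to a new, descriptively named DataFrame instead of overwriting `df`, but beware of the extra memory this requires. Suggested names by stage: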

1. **Initial Stages**:
   - `df_raw`: The original, unmodified dataset.
   - `df_loaded`: Data after initial loading, possibly from multiple sources.

2. **Cleaning and Preprocessing**:
   - `df_cleaned`: After basic cleaning (removing duplicates, handling missing values).
   - `df_filtered`: Data after filtering based on certain criteria.
   - `df_imputed`: Where missing values have been imputed.
   - `df_deduped`: After removing duplicates.

3. **Feature Engineering**:
   - `df_engineered`: After feature engineering (new features created).
   - `df_transformed`: After applying transformations (log, square root, etc.).
   - `df_normalized`: If the data has been normalized.
   - `df_standardized`: If the data has been standardized.

4. **Encoding and Formatting**:
   - `df_encoded`: After encoding categorical variables (one-hot, label encoding).
   - `df_binned`: After binning continuous variables.
   - `df_pivoted`: If data has been pivoted or reshaped.
   - `df_aggregated`: After aggregation operations (group by, etc.).

5. **Splitting**:
   - `df_train`: Training set.
   - `df_test`: Test set.
   - `df_validate`: Validation set.

6. **Modeling**:
   - `df_predictions`: Contains model predictions.
   - `df_residuals`: Residuals from model predictions.
   - `df_analyzed`: DataFrames used for deeper analysis post-modeling.

7. **Results and Export**:
   - `df_results`: Final results or outputs.
   - `df_export`: Data ready to be exported to a file or database.

8. **Special Cases**:
   - `df_merged`: After merging with another DataFrame.
   - `df_joined`: After joining with another DataFrame.
   - `df_sampled`: If a sample has been taken from the data.
   - `df_segmented`: If the data has been segmented (e.g., by customer type).

Each name corresponds to a common data processing or analysis task and makes it easier to track the purpose of each DataFrame in your workflow. Remember, these are just examples, and the actual names should align with the specific operations and logic of your project.