In [14]:
pip install awswrangler

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd

In [16]:

import pandas as pd

def load_data(filepath):
    """Carga los datos desde un archivo CSV."""
    df = pd.read_csv(filepath)
    print(f"Shape of the data: {df.shape}")
    return df


In [17]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample

def prepare_data(df, numeric_features, categorical_features, target_column):
    """Prepares and preprocesses the data by cleaning, feature engineering, and balancing."""
    
    # Fill missing values for 'RainToday' and 'RainTomorrow' (assuming binary classification with 'No' as the negative class)
    df['RainToday'].fillna('No', inplace=True)
    df['RainTomorrow'].fillna('No', inplace=True)
    
    # Convert 'RainToday' and 'RainTomorrow' to numeric
    df['RainToday'] = df['RainToday'].map({'No': 0, 'Yes': 1})
    df['RainTomorrow'] = df['RainTomorrow'].map({'No': 0, 'Yes': 1})
    
    # Create categorical feature based on rainfall
    if 'Location' in df.columns and 'Rainfall' in df.columns:
        wet_mean = df[['Location', 'Rainfall']].groupby('Location').mean().mean()[0]
        wet_cities_mean = df[['Location', 'Rainfall']].groupby('Location').mean()
        wet_cities_mean = wet_cities_mean[wet_cities_mean['Rainfall'] > wet_mean].index
        df['_Location_type'] = df['Location'].apply(lambda x: 'Wet Location' if x in wet_cities_mean else 'Dry Location')
    else:
        df['_Location_type'] = 'Unknown'  # Default if 'Location' or 'Rainfall' missing

    # Handle missing values
    imputer = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='mean'), numeric_features),
            ('cat', SimpleImputer(strategy='constant', fill_value='Unknown'), categorical_features)
        ])
    df[numeric_features + categorical_features] = imputer.fit_transform(df[numeric_features + categorical_features])

    # Date filtering and parsing
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
        df = df[df['Date'] > '2013-01-01']
        df['_Month'] = df['Date'].dt.month
        df.drop('Date', axis=1, inplace=True)

    # Check and balance classes
    if target_column in df.columns:
        class_counts = df[target_column].value_counts()
        if class_counts.min() / class_counts.max() < 0.5:
            minority_class = class_counts.idxmin()
            df_minority = df[df[target_column] == minority_class]
            df_majority = df[df[target_column] == class_counts.idxmax()]

            df_minority_upsampled = resample(df_minority, replace=True, n_samples=class_counts.max(), random_state=123)
            df = pd.concat([df_majority, df_minority_upsampled])

    # Handling outliers in 'Rainfall' if it exists in numeric_features
    if 'Rainfall' in numeric_features:
        q1 = df['Rainfall'].quantile(0.25)
        q3 = df['Rainfall'].quantile(0.75)
        iqr = q3 - q1
        df = df[~((df['Rainfall'] > (q3 + 1.5 * iqr)) | (df['Rainfall'] < (q1 - 1.5 * iqr)))]

    return df

# Example usage:
# df_loaded = pd.read_csv('your_data.csv')
# numeric_features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', '



In [18]:
import awswrangler as wr

In [19]:
import awswrangler as wr

import mlflow

# Para que funciones, todos nuestros scripts debemos exportar las siguientes variables de entorno
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [29]:
import awswrangler as wr

# Lee el archivo CSV desde S3, especificando que el delimitador es punto y coma
df_rain = wr.s3.read_csv("s3://data/data_info/weatherAUS.csv", sep=';')

# Muestra las dimensiones del DataFrame
print(df_rain.shape)

# Muestra las primeras filas del DataFrame
df_rain.head()


(145460, 23)


  df: pd.DataFrame = parser_func(f, **pandas_kwargs)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,1/12/2008,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2/12/2008,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,3/12/2008,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,4/12/2008,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,5/12/2008,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [30]:

data = df_rain
df_rain  = data.dropna()


In [31]:
import pandas as pd

# Definir las características numéricas, categóricas y la columna objetivo
numeric_features = ['Sunshine', 'Humidity9am', 'Humidity3pm', 'Cloud9am', 'Cloud3pm']
categorical_features = ['_Location_type']
target_column = 'RainTomorrow'

# Llamar a la función prepare_data con las características definidas
prepared_data = prepare_data(data, numeric_features, categorical_features, 'RainTomorrow')

print(prepared_data.head())



# Example usage:
# df_loaded = pd.read_csv('your_data.csv')
# numeric_features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
# categorical_features = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', '_Location_type']
## prepared_data = prepare_data(df_loaded, numeric_features, categorical_features, 'RainTomorrow')




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['RainToday'].fillna('No', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['RainTomorrow'].fillna('No', inplace=True)
  wet_mean = df[['Location', 'Rainfall']].groupby('Location').mean().mean()[0]


     Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  \
1432   Albury     13.8     33.6       0.0          NaN  7.611178         SSE   
1433   Albury     15.8     36.9       0.0          NaN  7.611178           E   
1434   Albury     18.6     40.7       0.0          NaN  7.611178         ENE   
1436   Albury     20.9     42.0      12.6          NaN  7.611178         SSE   
1437   Albury     21.9     40.4       0.0          NaN  7.611178         NNW   

      WindGustSpeed WindDir9am WindDir3pm  ...  Pressure9am  Pressure3pm  \
1432           28.0        SSW        ESE  ...       1012.2       1010.9   
1433           22.0          S        SSW  ...       1015.8       1011.9   
1434           44.0        ESE          N  ...       1012.3       1008.1   
1436           39.0        ENE         SW  ...       1017.6       1014.3   
1437           41.0          E        ENE  ...       1018.2       1012.4   

      Cloud9am Cloud3pm  Temp9am Temp3pm RainToday RainTomorro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['_Month'] = df['Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Date', axis=1, inplace=True)


In [32]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def encode_and_normalize_data(df, numeric_features, categorical_features, target_column):
    """Encodes categorical features using One-Hot Encoding and normalizes numeric features."""
    
    # Define the preprocessing for both numeric and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),  # Normalization for numeric features
            ('cat', OneHotEncoder(), categorical_features)  # One-hot encoding for categorical features
        ], remainder='passthrough')  # 'passthrough' for columns not listed

    # Separate the target from features
    X = df.drop(target_column, axis=1)
    y = df[target_column].map({'No': 0, 'Yes': 1})  # Convert target column to binary

    # Fit and transform the data using ColumnTransformer
    X_processed = preprocessor.fit_transform(X)

    # Generate new column names from one-hot encoding and existing numeric columns
    # Numeric feature names remain the same, get new names for encoded categorical features
    new_columns = numeric_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
    
    # Convert the processed data back to a DataFrame for ease of use
    X_processed = pd.DataFrame(X_processed, columns=new_columns, index=df.index)

    return X_processed, y




In [33]:
prepared_data.head(10)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,_Location_type,_Month
1432,Albury,13.8,33.6,0.0,,7.611178,SSE,28.0,SSW,ESE,...,1012.2,1010.9,4.447461,4.50993,21.4,31.2,0,0,Dry Location,1
1433,Albury,15.8,36.9,0.0,,7.611178,E,22.0,S,SSW,...,1015.8,1011.9,4.447461,4.50993,23.6,33.9,0,0,Dry Location,1
1434,Albury,18.6,40.7,0.0,,7.611178,ENE,44.0,ESE,N,...,1012.3,1008.1,4.447461,4.50993,27.8,39.3,0,0,Dry Location,1
1436,Albury,20.9,42.0,12.6,,7.611178,SSE,39.0,ENE,SW,...,1017.6,1014.3,4.447461,4.50993,29.2,40.7,1,0,Dry Location,1
1437,Albury,21.9,40.4,0.0,,7.611178,NNW,41.0,E,ENE,...,1018.2,1012.4,4.447461,4.50993,29.7,38.2,0,0,Dry Location,1
1438,Albury,21.9,39.2,0.0,,7.611178,WNW,78.0,NW,WSW,...,1002.4,997.2,4.447461,4.50993,34.5,34.9,0,0,Dry Location,1
1439,Albury,13.3,25.1,0.0,,7.611178,WSW,52.0,WSW,NW,...,1002.4,1002.1,4.447461,4.50993,17.0,23.9,0,0,Dry Location,1
1440,Albury,11.2,32.2,0.0,,7.611178,NNE,31.0,SE,SSE,...,1007.8,1006.0,4.447461,4.50993,20.1,29.5,0,0,Dry Location,1
1441,Albury,14.5,38.8,0.0,,7.611178,NNW,37.0,SE,NNW,...,1007.6,1004.1,4.447461,4.50993,23.0,36.4,0,0,Dry Location,1
1442,Albury,17.0,28.8,0.0,,7.611178,W,37.0,W,W,...,1007.0,1010.2,4.447461,4.50993,23.2,26.0,0,0,Dry Location,1


In [35]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def encode_and_normalize_data(df, numeric_features, categorical_features, target_column):
    """Encodes categorical features using One-Hot Encoding and normalizes numeric features."""
    
    # Define the preprocessing for both numeric and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),  # Normalization for numeric features
            ('cat', OneHotEncoder(), categorical_features)  # One-hot encoding for categorical features
        ], remainder='drop')  # Drop other columns that are not specified

    # Separate the target from features
    X = df[numeric_features + categorical_features]
    y = df[target_column].map({'No': 0, 'Yes': 1})  # Convert target column to binary

    # Fit and transform the data using ColumnTransformer
    X_processed = preprocessor.fit_transform(X)
    
    # Generate new column names from one-hot encoding and existing numeric columns
    # Numeric feature names remain the same, get new names for encoded categorical features
    new_columns = numeric_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
    
    # Convert the processed data back to a DataFrame for ease of use
    X_processed = pd.DataFrame(X_processed, columns=new_columns, index=df.index)
    # Eliminar filas con valores nulos
    
    return X_processed, y




In [36]:
# Example usage:
# Assuming 'prepared_data' is your DataFrame loaded with data including the 'RainTomorrow' column as the target
numeric_features = ['Sunshine', 'Humidity9am', 'Humidity3pm', 'Cloud9am', 'Cloud3pm']
categorical_features = ['_Location_type']
processed_data, labels = encode_and_normalize_data(prepared_data, numeric_features, categorical_features, 'RainTomorrow')


In [37]:
processed_data

Unnamed: 0,Sunshine,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,_Location_type_Dry Location,_Location_type_Wet Location
1432,0.175511,-1.275566,-1.840775,-0.202174,-0.212521,1.0,0.0
1433,0.175511,-2.125579,-1.703344,-0.202174,-0.212521,1.0,0.0
1434,0.175511,-1.488070,-2.069828,-0.202174,-0.212521,1.0,0.0
1436,0.175511,-1.700573,-1.611723,-0.202174,-0.212521,1.0,0.0
1437,0.175511,-1.328692,-1.611723,-0.202174,-0.212521,1.0,0.0
...,...,...,...,...,...,...,...
32672,-0.054191,-0.159925,-0.008356,-1.774738,1.001876,0.0,1.0
73849,0.175511,-0.213051,-0.466461,-0.202174,-0.212521,1.0,0.0
142111,-0.242109,0.265081,0.449749,0.506019,1.001876,0.0,1.0
117339,-1.181695,0.265081,-0.008356,1.418321,1.489572,1.0,0.0


In [38]:
labels.head(10)

1432   NaN
1433   NaN
1434   NaN
1436   NaN
1437   NaN
1438   NaN
1439   NaN
1440   NaN
1441   NaN
1442   NaN
Name: RainTomorrow, dtype: float64