In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import pickle
import os
import boto3

In [None]:
# Read the S3 connection settings from the environment.
# os.getenv yields None for any variable that is not set.
key_id = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
endpoint = os.getenv("AWS_S3_ENDPOINT")
bucket_name = os.getenv("AWS_S3_BUCKET")

# Object key under which the processed data set is stored in the bucket.
bucket_key = "processed_customer_churn_data"

# Build an S3 resource handle that talks to the configured endpoint.
session = boto3.session.Session(
    aws_access_key_id=key_id,
    aws_secret_access_key=secret_key,
)
s3 = session.resource(service_name="s3", endpoint_url=endpoint)

In [None]:
# Fetch the raw Telco customer-churn CSV directly from GitHub.
url = "https://raw.githubusercontent.com/red-hat-data-services/telecom-customer-churn-airflow/main/include/data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
telecom_cust = pd.read_csv(url)

# Show the first rows as the cell output.
telecom_cust.head()

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# because of errors="coerce".
telecom_cust["TotalCharges"] = pd.to_numeric(
    telecom_cust["TotalCharges"],
    errors="coerce",
)

# Report the number of missing values per column.
telecom_cust.isna().sum()

In [None]:
# Drop the rows that contain missing values. Rebinding the name (instead of
# dropna(inplace=True)) keeps the cell safe to re-run.
telecom_cust = telecom_cust.dropna()

# Remove the customer IDs (the first column) from the data set. The explicit
# .copy() makes df2 an independent frame, so the assignment below is not a
# write into a slice of telecom_cust.
df2 = telecom_cust.iloc[:, 1:].copy()

# Convert the target variable into a binary numeric variable (Yes -> 1, No -> 0).
# The former `df2['Churn'].replace(..., inplace=True)` was a chained assignment:
# with pandas Copy-on-Write it updates a temporary Series and leaves df2
# unchanged, so the encoded column is assigned back explicitly. The astype call
# raises if a value other than 'Yes'/'No' is present (map turns it into NaN)
# instead of silently passing it on.
df2['Churn'] = df2['Churn'].map({'Yes': 1, 'No': 0}).astype('int64')

# Convert all the remaining categorical variables into dummy variables.
df_dummies = pd.get_dummies(df2)
df_dummies.head()

In [None]:
# Separate the target vector from the feature matrix.
y = df_dummies['Churn'].values
X = df_dummies.drop('Churn', axis=1)

# Rescale every feature to the [0, 1] interval while keeping the column names.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split that follows -- confirm this is intended.
features = X.columns.values
scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(scaler.fit_transform(X), columns=features)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# partitioning repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Bundle the four partitions, serialize them with pickle and store the
# resulting bytes in the S3 bucket under bucket_key.
processed_data = (X_train, X_test, y_train, y_test)
pickled_data = pickle.dumps(processed_data, pickle.HIGHEST_PROTOCOL)
bucket = s3.Bucket(bucket_name)
bucket.put_object(Key=bucket_key, Body=pickled_data)