In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [5]:
# Read the crowdfunding dataset straight from its hosted CSV into a DataFrame.
crowdfunding_csv_url = 'https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_1/datasets/crowdfunding-data.csv'
df_crowdfunding = pd.read_csv(crowdfunding_csv_url)

# Leave the frame as the cell's last expression so the notebook renders it.
df_crowdfunding

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active,outcome
0,100,0,0,3,0,0,0,17,0
1,1400,14560,158,0,0,1,1,27,1
2,108400,142523,1425,4,0,0,2,20,1
3,4200,2477,24,0,0,0,1,40,0
4,7600,5265,53,0,0,0,3,4,0
...,...,...,...,...,...,...,...,...,...
1124,17130,15894,847,2,0,0,5,6,0
1125,97329,80937,862,6,0,0,3,29,0
1126,53597,40388,58,0,0,0,9,46,0
1127,71588,18102,274,0,0,0,2,43,0


In [6]:
# Define the feature set.
# X is every column of df_crowdfunding except the target column "outcome".
# DataFrame.drop returns a new frame, so df_crowdfunding itself is left
# untouched and neither an explicit .copy() nor inplace=True is needed.
X = df_crowdfunding.drop(columns="outcome")

# Preview the first rows to confirm "outcome" is gone.
X.head()

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active
0,100,0,0,3,0,0,0,17
1,1400,14560,158,0,0,1,1,27
2,108400,142523,1425,4,0,0,2,20
3,4200,2477,24,0,0,0,1,40
4,7600,5265,53,0,0,0,3,4


In [8]:
# Define the target vector.
# Take the "outcome" column as an array, then reshape it into a single column;
# the -1 lets NumPy infer the number of rows.
outcome_values = df_crowdfunding["outcome"].values
y = outcome_values.reshape(-1, 1)

# Peek at the first five targets.
y[:5]

array([[0],
       [1],
       [1],
       [0],
       [0]])

In [14]:
# Split the data into training and testing sets (30% of the rows held out for testing).
# train_test_split shuffles rows before splitting; pinning random_state makes
# the split - and therefore every scaled value and min/max reported in the
# following cells - identical on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=SPLIT_SEED
)

In [15]:
# Show the shapes of the training split, the testing split and the full
# feature set, in that order, as a single tuple.
tuple(features.shape for features in (X_train, X_test, X))

((790, 8), (339, 8), (1129, 8))

In [16]:
# Scale the X data with StandardScaler().
# fit_transform learns the scaling parameters from X_train and applies them to
# X_train in one call; the fitted scaler stays bound to `scaler` for later use.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Display the scaled training array.
X_train_scaled

array([[ 1.87305275,  1.84955924,  2.07809989, ...,  1.84754102,
         2.48187153, -1.54599576],
       [-0.68775416, -0.68222266, -0.68365999, ...,  1.84754102,
         0.86296267, -1.36280442],
       [-0.75696516, -0.64084108, -0.64952908, ..., -0.54125997,
        -0.75594619,  0.83549163],
       ...,
       [-0.76631799, -0.62754513, -0.45043207, ...,  1.84754102,
        -0.57606743, -0.32472017],
       [ 2.76344288,  1.02980193,  0.78965785, ..., -0.54125997,
         2.48187153, -1.54599576],
       [-0.82056445, -0.57139231, -0.63056746, ..., -0.54125997,
         1.22272019,  0.46910896]])

In [17]:
# Shape of the scaled training array, to compare against X_train.shape.
X_train_scaled.shape

(790, 8)

In [18]:
# Transform the test dataset using the scaler that was fit on the training
# dataset. Only transform() is called here, so the scaler is not re-fit on
# X_test.
X_test_scaled = scaler.transform(X_test)
# Display the scaled test array.
X_test_scaled

array([[-0.68588359, -0.49141374, -0.53575936, ...,  1.84754102,
         0.32332638, -0.50791151],
       [-0.83739956, -0.74547525, -0.7083101 , ..., -0.54125997,
        -0.57606743, -0.14152884],
       [-0.81682332, -0.74792791, -0.71115434, ..., -0.54125997,
        -0.93582495, -1.4238682 ],
       ...,
       [-0.7775414 , -0.61594574, -0.53575936, ...,  1.84754102,
         0.14344762, -0.14152884],
       [-0.78689424, -0.64766425, -0.57178644, ..., -0.54125997,
        -0.93582495, -0.26365639],
       [-0.7083304 , -0.52244993, -0.5585133 , ..., -0.54125997,
        -0.57606743, -0.50791151]])

In [19]:
# Report the overall minimum and maximum of the scaled training and testing
# arrays (taken across every element, not per column).
standard_extremes = [
    ("Training data min:", X_train_scaled.min()),
    ("Training data max:", X_train_scaled.max()),
    ("Testing data min:", X_test_scaled.min()),
    ("Testing data max:", X_test_scaled.max()),
]
print("Scaled data min/max (StandardScaler):")
for caption, extreme in standard_extremes:
    print(caption, extreme)

Scaled data min/max (StandardScaler):
Training data min: -1.6681233141500866
Training data max: 5.349927344746071
Testing data min: -1.6681233141500866
Testing data max: 6.192771333225068


In [20]:
# Alternatively, scale the data with MinMaxScaler().
# NOTE: this rebinds `scaler` and `X_train_scaled`, replacing the
# StandardScaler versions created earlier in the notebook.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Display the rescaled training array.
X_train_scaled

array([[0.73078855, 0.7140081 , 0.46128629, ..., 1.        , 0.86956522,
        0.03333333],
       [0.04319437, 0.02281149, 0.00655635, ..., 1.        , 0.47826087,
        0.08333333],
       [0.02461075, 0.03410899, 0.01217608, ..., 0.        , 0.08695652,
        0.68333333],
       ...,
       [0.02209945, 0.03773889, 0.04495785, ..., 1.        , 0.13043478,
        0.36666667],
       [0.96986439, 0.49020783, 0.24914143, ..., 0.        , 0.86956522,
        0.03333333],
       [0.0075339 , 0.05306905, 0.01529816, ..., 0.        , 0.56521739,
        0.58333333]])

In [21]:
# Transform the test dataset using the scaler that was fit on the training
# dataset. When the notebook is run top to bottom, `scaler` at this point is
# the MinMaxScaler fit on X_train; it is not re-fit on X_test.
X_test_scaled = scaler.transform(X_test)
# Display the rescaled test array.
X_test_scaled

array([[0.04369663, 0.07490384, 0.03090852, ..., 1.        , 0.34782609,
        0.31666667],
       [0.00301356, 0.00554303, 0.00249766, ..., 0.        , 0.13043478,
        0.41666667],
       [0.00853842, 0.00487343, 0.00202935, ..., 0.        , 0.04347826,
        0.06666667],
       ...,
       [0.01908589, 0.04090561, 0.03090852, ..., 1.        , 0.30434783,
        0.41666667],
       [0.01657459, 0.03224621, 0.02497658, ..., 0.        , 0.04347826,
        0.38333333],
       [0.03766951, 0.06643071, 0.02716204, ..., 0.        , 0.13043478,
        0.31666667]])

In [22]:
# Report the overall minimum and maximum of the scaled training and testing
# arrays (taken across every element, not per column).
minmax_extremes = [
    ("Training data min:", X_train_scaled.min()),
    ("Training data max:", X_train_scaled.max()),
    ("Testing data min:", X_test_scaled.min()),
    ("Testing data max:", X_test_scaled.max()),
]
print("Scaled data min/max (MinMaxScaler):")
for caption, extreme in minmax_extremes:
    print(caption, extreme)

Scaled data min/max (MinMaxScaler):
Training data min: 0.0
Training data max: 1.0
Testing data min: 0.0
Testing data max: 1.1387761473618483
