In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pd.read_csv(r"C:\Users\Rizvan\Documents\latihan\latihan\cuk.csv")

# Data Wrangling

1. Basic

In [None]:
# Create DataFrame 
dataframe = pd.DataFrame()

# Add columns 
dataframe['Name'] = ['Jacky Jackson', 'Steven Stevenson']
dataframe['Age'] = [38, 25]
dataframe['Driver'] = [True, False]

# Create row 
new_person = pd.Series(['Molly Mooney', 40, True], index=['Name','Age','Driver'])

# Append row
dataframe.append(new_person, ignore_index=True)

#filter row
dataframe[(dataframe['Sex'] == 'female') & (dataframe['Age'] >= 65)]

# create column names
import collections
column_names = collections.defaultdict(str)
# Create keys
for name in df.columns:
    column_names[name]
    
# Calculate statistics 
print('Maximum:', dataframe['Age'].max())
print('Minimum:', dataframe['Age'].min())
print('Mean:', dataframe['Age'].mean())
print('Sum:', dataframe['Age'].sum())
print('Count:', dataframe['Age'].count())

#Show counts 
dataframe.count()

#Show value counts 
dataframe['PClass'].value_counts()

# Reset Index, Convert Data types
df = df.reset_index(drop = True)
df_cvt = df.convert_objects(convert_numeric=True)
df_cvt['Powerbin'] = df_cvt['Powerbin'].astype("category")

li = df[df_cvt['Power'].isnull() == True].index
df_cvt.isnull().sum()

2. Drop columns / rows

In [None]:
#Drop columns with name
dataframe.drop(['Age', 'Sex'], axis=1).head(2)

#Drop columns without names
dataframe.drop(dataframe.columns[1], axis=1).head(2)

# Delete row, show first two rows of output 
dataframe[dataframe.index != 0].head(2)

# Delete rows, show first two rows of output 
dataframe[dataframe['Sex'] != 'male'].head(2)

# drop the first two rows
df.drop([0, 1], axis=0)

# Drop duplicates, show first two rows 
dataframe.drop_duplicates().head(2)

# Drop duplicates
dataframe.drop_duplicates(subset=['Sex'], keep='last')

3. Group, Functions, Merge

In [None]:
#Groupby, calculate mean
dataframe.groupby(['Sex','Survived'])['Age'].mean()

#Group rows by week, calculate sum per week 
dataframe.resample('W').sum()
dataframe.resample('2W').mean()
dataframe.resample('M').count()
dataframe.resample('M', label='left').count()

#Looping over a column
for name in dataframe['Name'][0:2]:
    print(name.upper())
    
# Apply function, show two rows
def uppercase(x):
    return x.upper()
dataframe['Name'].apply(uppercase)[0:2]

# Group rows, apply function to groups 
dataframe.groupby('Sex').apply(lambda x: x.count())

# Concatenate DataFrames by rows 
pd.concat([dataframe_a, dataframe_b], axis=0)
# Concatenate DataFrames by columns 
pd.concat([dataframe_a, dataframe_b], axis=1)

# Merge DataFrames 
pd.merge(dataframe_employees, dataframe_sales, on='employee_id', how= 'outer') #how : left atau right

4. Data Describing

In [3]:
df.head() #data atas
df.tail() #data terbawah
df.describe() #statistika deskriptif
df.shape #ukuran data

(259347, 31)

5. Replace and Renaming

In [4]:
#load dataframe
dataframe = pd.read_csv('https://tinyurl.com/titanic-csv')

#data replacing
dataframe['Sex'].replace("female", "Woman").head(2) #replace 1 value
dataframe['Sex'].replace(["female", "male"], ["Woman", "Man"]).head(5) #replace multiple values
dataframe.replace(1, "One").head(2) #global replace
dataframe.replace(r"1st", "First", regex=True).head(2) #regex

#data renaming
dataframe.rename(columns={'PClass': 'Passenger Class', 'Sex': 'Gender'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


# FUNGSI

In [58]:
def imputting_median(df):
    for column in df:
        df[column].fillna(df[column].median(), inplace = True)

# Handling Numerical Data

In [114]:
from sklearn import preprocessing
#minmax scaling
def minmax_scaling(feature):
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_feature = minmax_scale.fit_transform(feature)
    return feature

# Standard Scaler , mean = 0, var = 1 
scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(x)

# Robust Scaler
robust_scaler = preprocessing.RobustScaler()
robust = robust_scaler.fit_transform(x)

# Normalizer, l2 = euclidean , l1 = manhattan
normalizer = Normalizer(norm="l2")
normalized = normalizer.transform(features)

# Custom transformation 
def add_ten(x):
    return x + 10

ten_transformer = FunctionTransformer(add_ten)
transformed = ten_transformer.transform(features)
df.apply(add_ten)

# Outlier Detector
outlier_detector = EllipticEnvelope(contamination=.1)
outlier_detector.fit(features)
outlier_detector.predict(features)

def indicies_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))
# Run function 
indicies_of_outliers(feature)

# Clustering K means
from sklearn.cluster import KMeans
clusterer = KMeans(3, random_state=0)
clusterer.fit(features)
dataframe["group"] = clusterer.predict(features)


#Binning
from sklearn.preprocessing import Binarizer
df_cvt['Power'] = df_cvt['Power'].bfill()
binarizer = Binarizer(df_cvt['Power'].mean())
df_cvt['Powerbin'] = binarizer.fit_transform([df_cvt['Power']]).T
np.digitize(age, bins=[20,30,64], right=True)

# KNN
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() 
standardized_features = scaler.fit_transform(features)
features_knn_imputed = KNN(k=5, verbose=0).complete(standardized_features)

# Load library 
from sklearn.preprocessing import Imputer
# Create imputer 
mean_imputer = Imputer(strategy="mean", axis=0)
features_mean_imputed = mean_imputer.fit_transform(features)


# Remove observations with missing values
dataframe.dropna()



# Handling Categorical

In [204]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
# Create dummy variables from feature 
pd.get_dummies(feature[:,0])

# Create one-hot encoder 
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)
one_hot.inverse_transform(one_hot.transform(feature)) #reverse one hot encoding

# multiclass one hot encoder
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)
one_hot_multiclass.classes_


#Scale mapper
scale_mapper = {"Low":1,
                "Medium":2, 
                "High":3}
dataframe["Score"].replace(scale_mapper)

# Imputting with KNN
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(3, weights='distance')
trained_model = clf.fit(X[:,1:], X[:,0])
imputed_values = trained_model.predict(X_with_nan[:,1:])
X_with_imputed = np.hstack((imputed_values.reshape(-1,1), X_with_nan[:,1:]))
np.vstack((X_with_imputed, X))


#Imputting with frequent
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='most_frequent', axis=0)
imputer.fit_transform(X_complete)
                      
#Imbalanced Dataset
RandomForestClassifier(class_weight="balanced")
i_class0 = np.where(target == 0)[0] 
i_class1 = np.where(target == 1)[0]
n_class0 = len(i_class0) 
n_class1 = len(i_class1)
#downsample
i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)
np.hstack((target[i_class0], target[i_class1_downsampled]))
np.vstack((features[i_class0,:], features[i_class1_downsampled,:]))[0:5]
#upsample
i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)
np.concatenate((target[i_class0_upsampled], target[i_class1]))
np.vstack((features[i_class0_upsampled,:], features[i_class1,:]))[0:5]

# Handling Datetime

In [163]:
df = pd.read_csv(r"C:\Users\Rizvan\Documents\ml\Liquid Air\Train.csv")
df_converted = df.convert_objects(convert_numeric=True)
#col names
col = 'TimeStamp'

#convert string to datetime
df_time = [pd.to_datetime(date, format='%m/%d/%Y %H:%M') for date in df[col]]
df['date'] = pd.DataFrame(df_time)

# Set index 
df = df.set_index(df['date'])

# Select observations between two datetimes 
df.loc['2025-1-1 01:00:00':'2025-1-1 04:00:00']

# Datetime Feature Engingeering
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
df['day_of_week'] = df['date'].dt.weekday
df['day_names'] = df['date'].dt.weekday_name

# lagged values
lag = 1
df["Power_lag1"] = df["Power"].shift(lag)

# Handling Time Series Missing Values
col = 'Liquid Argon'
#df = df.interpolate(method = "quadratic")
#df = df.bfill()
#df = df.ffill()

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


# Model Evaluating

In [None]:
from sklearn import metrics 
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score 
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler

#CV Model
standardizer = StandardScaler() 
logit = LogisticRegression()
pipeline = make_pipeline(standardizer, logit)
kf = KFold(n_splits=10, shuffle=True, random_state=1)
cv_results = cross_val_score(pipeline, # Pipeline
                             features, # Feature matrix 
                             target, # Target vector    
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function #precision, recall, f1, neg_mean_squared_error, r2
                             n_jobs=-1) # Use all CPU scores
cv_results.mean()

#Train Test Split
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=1)
# Fit standardizer to training set
standardizer = StandardScaler() 
standardizer.fit(features_train)
# Apply to both training and test sets 
features_train_std = standardizer.transform(features_train) 
features_test_std = standardizer.transform(features_test)


#Single Evaluation
from sklearn.metrics import accuracy_score
y_hat = logit.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_hat)

#Clustering evaluation
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2, random_state=1).fit(features)
target_predicted = model.labels_
silhouette_score(features, target_predicted)



# MODELLING

In [1]:
import warnings
warnings.filterwarnings('ignore')

Neural Network

In [2]:
from keras import models 
from keras import layers

ModuleNotFoundError: No module named 'keras'

In [56]:
df.head()
listnum = ['courier_count', 'view_count', 'seller_delivery_response_time', 'varSeller_6', 'varSeller_9', 'varProduct_2', 'varProduct_5']
df[list(df.columns)] = df[list(df.columns)].astype('category')
df[listnum] = df[listnum].astype('float')
df.dtypes


y = df['Decision']
X = df.drop(['Decision'], axis = 1)

In [35]:
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score


mdl_xgb = XGBClassifier()
mdl_svm = svm.SVC()
mdl_gnb = LogisticRegression()

#print('Score ROC_AUC Cross Validation :'); print(cross_val_score(mdl_svm, X, y, scoring="roc_auc", cv = 5),'\n')

mean_gnb_score = cross_val_score(mdl_gnb, X, y, scoring="roc_auc", cv = 5).mean()
acc_gnb_score = cross_val_score(mdl_gnb, X, y, scoring="accuracy", cv = 5).mean()
std_gnb_score = cross_val_score(mdl_gnb, X, y, scoring="precision", cv = 5).mean()
f1_gnb_score = cross_val_score(mdl_gnb, X, y, scoring="f1", cv = 5).mean()

std_score = cross_val_score(mdl_gnb, X, y, scoring="roc_auc", cv = 5).std()
print('Score Rata-rata AUC : '); print(mean_xgb_score, '\n')
print('Score Rata-rata Accuracy :'); print(acc_xgb_score, '\n')
print('Score Rata-rata Precision :'); print(std_xgb_score, '\n')
print('Score Rata-rata f1 Score :'); print(f1_xgb_score, '\n')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Score Rata-rata AUC : 
0.6660650917351713 

Score Rata-rata Accuracy :
0.9847820470665809 

Score Rata-rata Precision :
0.0 

Score Rata-rata f1 Score :
0.0 



In [52]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_standardized = scaler.fit_transform(X)
logistic_regression = LogisticRegression(random_state=0, class_weight="balanced")
model = logistic_regression.fit(features_standardized, y)

mean_gnb_score = cross_val_score(model, X, y, scoring="roc_auc", cv = 5).mean()
acc_gnb_score = cross_val_score(model, X, y, scoring="accuracy", cv = 5).mean()
std_gnb_score = cross_val_score(model, X, y, scoring="precision", cv = 5).mean()
f1_gnb_scorec = cross_val_score(model, X, y, scoring="f1", cv = 5).mean()



In [53]:
print('Score Rata-rata AUC : '); print(mean_gnb_score, '\n')
print('Score Rata-rata Accuracy :'); print(acc_gnb_score, '\n')
print('Score Rata-rata Precision :'); print(std_gnb_score, '\n')
print('Score Rata-rata f1 Score :'); print(f1_gnb_score, '\n')

Score Rata-rata AUC : 
0.6594811342572795 

Score Rata-rata Accuracy :
0.5933150200713729 

Score Rata-rata Precision :
0.023908034493593376 

Score Rata-rata f1 Score :
0.0 



In [69]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
features_train, features_test, target_train, target_test = train_test_split(X, y, random_state=1)

logistic_regression = LogisticRegression(random_state=0, class_weight="balanced", solver = "sag")
target_predicted = logistic_regression.fit(features_train, target_train).predict(features_test)
confusion_matrix(target_test, target_predicted)

print(classification_report(target_test,                            
                            target_predicted))



              precision    recall  f1-score   support

           0       0.99      0.30      0.46     63840
           1       0.02      0.82      0.04       985

    accuracy                           0.31     64825
   macro avg       0.50      0.56      0.25     64825
weighted avg       0.98      0.31      0.46     64825



0    63840
1      985
Name: Decision, dtype: int64