Shyju Kozhisseri<br/>ID: 309572<br/>Group: J41323c

## Import Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import KBinsDiscretizer
from copy import  copy
import networkx as nx
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)

In [None]:
input = pd.read_csv('garments_worker_productivity.csv')

In [None]:
input.head()

In [None]:
input.describe()

In [None]:
input.isna().sum()

In [None]:
sub_sample = input[['team','smv', 'no_of_workers', 'wip', 'over_time', 'incentive', 'idle_time', 'idle_men', 'no_of_style_change', 'targeted_productivity', 'actual_productivity']]
sub_sample

In [None]:
sub_sample = sub_sample.fillna(sub_sample.mean())
sub_sample.head()

## Scale Data

In [None]:
sub_sample.columns

In [None]:
scaler = MinMaxScaler()
original_data = sub_sample.copy()
sub_sample[['smv', 'no_of_workers', 'wip', 'over_time', 'incentive', 'idle_time','idle_men', 'no_of_style_change', 'team']] = scaler.fit_transform(sub_sample.iloc[:,:-2])
sub_sample

In [None]:
sub_sample.describe()

## Correlation Coefficient

In [None]:
sub_sample.corr()

In [None]:
corrMatrix = sub_sample.corr()
plt.rcParams['figure.figsize'] = [20, 20]
sns.heatmap(corrMatrix, annot=True, cmap='Blues')
plt.show()

In [None]:
# Variance inflation factor of every column of `sub_sample`, one row per column.
# NOTE(review): the matrix passed in contains the two productivity columns and
# no intercept column — confirm this is the intended design matrix.
vif_data = pd.DataFrame({
    "feature": sub_sample.columns,
    "VIF": [
        variance_inflation_factor(sub_sample.values, column_position)
        for column_position in range(sub_sample.shape[1])
    ],
})

vif_data

## Sampling

### Inverse Transform Sampling

In [None]:
from scipy.stats import ks_2samp
from functools import partial

def inv_transform_sampling(values, num_samples=100):
    """Draw random samples following the empirical distribution of ``values``.

    A density histogram of ``values`` is accumulated into a piecewise-linear
    CDF; uniform random numbers are then mapped through the inverse of that
    CDF by linear interpolation.

    Parameters
    ----------
    values : array-like
        Observations to imitate (a sequence, NumPy array or pandas Series).
    num_samples : int, default 100
        Number of samples to draw.

    Returns
    -------
    tuple
        ``(samples, n_bins)``: an array of ``num_samples`` drawn values and
        the number of histogram bins that was used.

    Raises
    ------
    ValueError
        If ``values`` is empty.
    """
    values = np.atleast_1d(np.asarray(values, dtype=float))
    if values.size == 0:
        raise ValueError("values must contain at least one observation")

    # Sturges' rule: k = 1 + 3.322 * log10(n), where 3.322 ~= 1 / log10(2).
    # The logarithm has to be base 10; a natural log roughly doubles k.
    n_bins = int(1 + np.round(3.322 * np.log10(values.shape[0])))
    hist, bin_edges = np.histogram(values, bins=n_bins, density=True)

    # CDF evaluated at the bin edges: 0 at the left-most edge, 1 at the right-most.
    cdf_at_edges = np.concatenate(([0.0], np.cumsum(hist * np.diff(bin_edges))))

    # Inverse CDF: look up the edge position that corresponds to each uniform draw.
    uniform_values = np.random.rand(num_samples)
    samples = np.interp(uniform_values, cdf_at_edges, bin_edges)

    return samples, n_bins




# For every column: draw 100 values by inverse transform sampling, plot their
# histogram against a KDE of the full column, and print the two-sample KS test.
for feature_name, feature_values in sub_sample.items():
    nsample, n_bins = inv_transform_sampling(feature_values, num_samples=100)

    # KDE of the full column, evaluated over the range covered by the sample.
    grid = np.linspace(nsample.min(), nsample.max(), len(nsample))
    original_density = stats.gaussian_kde(feature_values)(grid)

    plt.figure(figsize=(8, 5))
    sns.histplot(nsample, stat='density', bins=10, kde=True, label=feature_name+'_sample')
    plt.plot(grid, original_density, 'r--', label='Original')
    plt.title(feature_name)
    plt.xlabel('sample')
    plt.ylabel('p')
    plt.legend()
    plt.show()

    print(ks_2samp(nsample, feature_values))

### Von Neumann (Acceptance-Rejection) Sampling

In [None]:
def geom_sampling(values, num_samples=100):
    """Draw samples from the histogram of ``values`` by acceptance-rejection.

    Points ``(x, y)`` are drawn uniformly from the rectangle that encloses the
    density histogram; ``x`` is kept whenever ``y`` falls under the histogram
    bar above ``x``. Batches are drawn until enough points were accepted.

    Parameters
    ----------
    values : array-like
        Observations to imitate (a sequence, NumPy array or pandas Series).
    num_samples : int, default 100
        Number of samples to return.

    Returns
    -------
    tuple
        ``(samples, n_bins)``: an array with ``num_samples`` accepted values
        and the number of histogram bins that was used.

    Raises
    ------
    ValueError
        If ``values`` is empty.
    """
    values = np.atleast_1d(np.asarray(values, dtype=float))
    if values.size == 0:
        raise ValueError("values must contain at least one observation")

    # Sturges' rule: k = 1 + 3.322 * log10(n), where 3.322 ~= 1 / log10(2).
    n_bins = int(1 + np.round(3.322 * np.log10(values.shape[0])))
    hist, bin_edges = np.histogram(values, bins=n_bins, density=True)
    left, right = bin_edges[0], bin_edges[-1]
    top = hist.max()

    # Start with an empty chunk so that a non-positive `num_samples` still
    # returns an empty float array.
    accepted_chunks = [np.empty(0)]
    accepted_count = 0

    while accepted_count < num_samples:
        x_sampl = np.random.uniform(left, right, size=num_samples)
        # The envelope must start at 0. Starting at min(hist) would accept x
        # with probability proportional to (hist - min(hist)) and would almost
        # never return values from the least populated bin.
        y_sampl = np.random.uniform(0.0, top, size=num_samples)
        # Bin index of every x; clipped in case rounding puts x on the right edge.
        x_indices = np.clip(np.digitize(x_sampl, bin_edges) - 1, 0, n_bins - 1)
        kept = x_sampl[y_sampl <= hist[x_indices]]
        accepted_chunks.append(kept)
        accepted_count += kept.shape[0]

    return np.concatenate(accepted_chunks)[:num_samples], n_bins



# Same comparison as for inverse transform sampling, but with the
# acceptance-rejection sampler: histogram of 100 drawn values, KDE of the
# full column, and the two-sample KS test.
for column_label in sub_sample.columns:
    full_column = sub_sample[column_label]
    nsample, n_bins = geom_sampling(full_column, num_samples=100)

    x_axis = np.linspace(nsample.min(), nsample.max(), len(nsample))
    density_of_original = stats.gaussian_kde(full_column)
    curve = density_of_original(x_axis)

    plt.figure(figsize=(8, 5))
    sns.histplot(nsample, stat='density', bins=10, kde=True, label=column_label+'_sample')
    plt.plot(x_axis, curve, 'r--', label='Original')
    plt.title(column_label)
    plt.ylabel('p')
    plt.xlabel('sample')
    plt.legend()
    plt.show()

    print(ks_2samp(nsample, full_column))

### Cluster Sampling

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, random_state=0).fit(sub_sample.iloc[:, -1].values.reshape(-1, 1))

In [None]:
new_sample = sub_sample.copy()
new_sample['labels'] = kmeans.labels_
new_sample.head()

In [None]:
def cluster_sampling(df, number_of_clusters, number_of_items, random_state=None):
    """Draw an (almost) equal number of rows from every cluster of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``'labels'`` column holding cluster ids
        ``0 .. number_of_clusters - 1``.
    number_of_clusters : int
        Number of clusters to draw from.
    number_of_items : int
        Total number of rows to return.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for reproducible draws. ``None`` (default) keeps the
        previous behaviour of using NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        ``number_of_items`` rows of ``df``, sorted by index, with the column
        dtypes of ``df``.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a cluster holds fewer rows
        than the per-cluster quota.
    """
    # Ceiling division: every cluster contributes the same quota; the surplus
    # is trimmed after concatenation (so it is taken from the last cluster).
    item_per_cluster = -(-number_of_items // number_of_clusters)

    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # `DataFrame.append` was removed in pandas 2.0; collecting the pieces and
    # concatenating once also avoids seeding the result with an empty
    # object-dtype frame.
    pieces = [
        df.loc[df['labels'] == index].sample(item_per_cluster, random_state=rng)
        for index in range(number_of_clusters)
    ]
    sample = pd.concat(pieces)

    return sample.iloc[:number_of_items, :].sort_index()


In [None]:
# Draw 100 rows in total from the 4 KMeans clusters stored in new_sample['labels'].
cluster_sample = cluster_sampling(new_sample, 4, 100)
cluster_sample.head()

In [None]:
# Column means of the cluster sample next to those of the full data.
cluster_sample.mean(), sub_sample.mean()

In [None]:
# Column variances of the cluster sample next to those of the full data.
cluster_sample.var(), sub_sample.var()

In [None]:
# Compare every column of the cluster sample with the full data: histogram of
# the sample, KDE of the full column, and the two-sample KS test.
for column in sub_sample.columns:
    y = cluster_sample[column]

    # Columns that are constant within the sample are skipped.
    if y.var() == 0:
        continue

    y_all = sub_sample[column]
    x = np.linspace(y.min(), y.max(), len(y))

    plt.figure(figsize=(8, 5))
    kde_values = stats.gaussian_kde(y_all)(x)

    sns.histplot(y, kde=True, stat='density', label=column, bins=10)
    plt.plot(x, kde_values, 'r--', label='Original')
    plt.ylabel('p')
    plt.xlabel('sample')
    plt.legend()
    plt.show()
    print(ks_2samp(y, y_all))

## Relationship Between Predictors and Target

In [None]:
# Correlation matrix of all columns of the scaled data, drawn as an annotated
# heatmap. No figure size is passed here, so the current matplotlib default applies.
corrMatrix = sub_sample.corr()
sns.heatmap(corrMatrix, annot=True, cmap='Blues')
# Optional: uncomment to write the heatmap to disk.
#plt.savefig('corr.png')

## Bayesian Network

In [None]:
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator, BDeuScore, K2Score, BicScore, HillClimbSearch, TreeSearch
from pgmpy.sampling import BayesianModelSampling
from pgmpy.base import DAG

In [None]:
data_sample = cluster_sample.iloc[:, :-1]
data_sample.columns

In [None]:
transformed_data = copy(data_sample)
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
data_discrete = est.fit_transform(data_sample[['team','no_of_workers', 'idle_men', 'no_of_style_change']])
transformed_data[['team','no_of_workers', 'idle_men', 'no_of_style_change']] = data_discrete

In [None]:
blacklist = [(x, y) for x in transformed_data.columns.to_list() for y in ['smv', 'wip'] if x != y]
blacklist

In [None]:
def sampling (bn: DAG, data: pd.DataFrame, n: int = 100):
    """Fit a Bayesian network with the edges of ``bn`` to ``data`` and sample from it.

    Parameters
    ----------
    bn : DAG
        Graph whose edges define the network structure.
    data : pd.DataFrame
        Data used to fit the network parameters.
    n : int, default 100
        Number of rows to generate.

    Returns
    -------
    The result of ``forward_sample`` requested with ``return_type='dataframe'``.
    """
    fitted_network = BayesianModel(bn.edges())
    fitted_network.fit(data)
    return BayesianModelSampling(fitted_network).forward_sample(size=n, return_type='dataframe')

### Basic Network

In [None]:
# Hand-specified network structure, given as (parent, child) edges.
basic_network_edges = [
    ("smv", "wip"),
    ("incentive", "over_time"),
    ("over_time", "wip"),
    ("wip", "targeted_productivity"),
    ("team", "targeted_productivity"),
    ("targeted_productivity", "actual_productivity"),
]
model = BayesianModel(basic_network_edges)

# Draw the structure as a directed graph on a circular layout.
G_bm = nx.DiGraph()
G_bm.add_edges_from(model.edges())
plt.figure(figsize=(8,5))
pos = nx.layout.circular_layout(G_bm)
nx.draw(G_bm, pos, with_labels=True,font_weight='bold')

In [None]:
sample_bm = sampling(model, transformed_data, 100)
sample_bm.columns

In [None]:
sample_bm[['incentive', 'targeted_productivity', 'over_time', 'wip']] = est.inverse_transform(sample_bm[['incentive', 'targeted_productivity', 'over_time', 'wip']].values)

In [None]:
plt.figure(figsize=(8,5)) 
sns.histplot(data_sample['actual_productivity'], label='Original data', color='b')
sns.histplot(sample_bm['actual_productivity'], label='Generated data', color='orange')
plt.legend()

### Chow-Liu Algorithm

In [None]:
# Learn a tree-structured network from the discretized data with the
# 'chow-liu' estimator.
# NOTE(review): 'team' is presumably the root node of the tree — confirm
# against the TreeSearch signature of the installed pgmpy version.
cb = TreeSearch(transformed_data, 'team')
best_model_new = cb.estimate(estimator_type='chow-liu')
# Draw the learned structure as a directed graph on a circular layout.
G_cb = nx.DiGraph()
G_cb.add_edges_from(best_model_new.edges())
pos = nx.layout.circular_layout(G_cb)
plt.figure(figsize=(8,5)) 
nx.draw(G_cb, pos, with_labels=True,font_weight='bold')

In [None]:
sample_cb = sampling(best_model_new, transformed_data, 100)
sample_cb.columns

In [None]:
sample_cb[['incentive', 'targeted_productivity', 'over_time', 'smv']] = est.inverse_transform(sample_cb[['incentive', 'targeted_productivity', 'over_time', 'smv']].values)

In [None]:
plt.figure(figsize=(8,5)) 
sns.histplot(data_sample['actual_productivity'], label='Original data', color='b')
sns.histplot(sample_cb['actual_productivity'], label='Generated data', color='orange')
plt.legend()

### HillClimb Search Algorithm

In [None]:
# Learn a network structure by hill climbing, scored with K2Score on the
# discretized data.
# NOTE(review): the blacklist holds the pairs (column, 'smv') and
# (column, 'wip'); presumably these are edges the search may not add — confirm
# the pair orientation against the pgmpy documentation.
hc = HillClimbSearch(transformed_data, scoring_method=K2Score(transformed_data))
# This rebinds `best_model_new`, which previously held the Chow-Liu structure.
best_model_new = hc.estimate(black_list=blacklist)
# Draw the learned structure as a directed graph on a circular layout.
G_K2 = nx.DiGraph()
G_K2.add_edges_from(best_model_new.edges())
pos = nx.layout.circular_layout(G_K2)
plt.figure(figsize=(8,5)) 
nx.draw(G_K2, pos, with_labels=True,font_weight='bold')

In [None]:
sample_K2 = sampling(best_model_new, transformed_data, 100)
sample_K2.columns

In [None]:
sample_K2[['incentive', 'targeted_productivity', 'over_time', 'wip']] = est.inverse_transform(sample_K2[['incentive', 'targeted_productivity', 'over_time', 'wip']].values)

In [None]:
plt.figure(figsize=(8,5)) 
sns.histplot(data_sample['actual_productivity'], label='Original data', color='b')
sns.histplot(sample_K2['actual_productivity'], label='Generated data', color='orange')
plt.legend()

### Performance Metrics

In [None]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error, mean_absolute_error

In [None]:
# KDE of 'actual_productivity' for the original sample and for the three
# generated samples, all evaluated on one grid spanning the original sample.
reference_values = data_sample['actual_productivity']
x = np.linspace(reference_values.min(), reference_values.max(), len(reference_values))

density_sources = [
    ("Original Sample", reference_values),
    ("Generated Sample-SimpleBN", sample_bm['actual_productivity']),
    ("Generated Sample-HCK2Score", sample_K2['actual_productivity']),
    ("Generated Sample-ChowLiu", sample_cb['actual_productivity']),
]

plt.figure(figsize=(8, 5))
for curve_label, curve_values in density_sources:
    plt.plot(x, stats.gaussian_kde(curve_values)(x), label=curve_label)
plt.title('KDE')
plt.ylabel('p')
plt.xlabel('actual_productivity')
plt.legend()
plt.show()

In [None]:
# The values are sorted before scoring, so every metric compares the ordered
# original values with the ordered generated values position by position.
y_true = np.sort(data_sample['actual_productivity'])
y_pred_2 = np.sort(sample_K2['actual_productivity'])
y_pred_3 = np.sort(sample_cb['actual_productivity'])
y_pred_4 = np.sort(sample_bm['actual_productivity'])

sorted_predictions = [y_pred_2, y_pred_3, y_pred_4]
report_sections = [
    (r2_score, ["R2 Score (HC Approach-K2Score):", "R2 Score (Chow-Liu Approach):", "R2 Score (Simple BN):"]),
    (mean_squared_error, ["MSE (HC Approach-K2Score):", "MSE (Chow-Liu Approach):", "MSE (Simple BN):"]),
    (mean_absolute_error, ["MAE (HC Approach-K2Score):", "MAE (Chow-Liu Approach):", "MAE (Simple BN):"]),
]

for section_position, (metric, captions) in enumerate(report_sections):
    # Blank separator between consecutive metric groups.
    if section_position > 0:
        print("\n")
    for caption, y_pred in zip(captions, sorted_predictions):
        print(caption, metric(y_true, y_pred))

In [None]:
# Two-sample KS test of each generated 'actual_productivity' sample against
# the original sample.
for ks_caption, generated_sample in (
    ("HC-K2Score: ", sample_K2),
    ("Chow-Liu: ", sample_cb),
    ("Simple BN: ", sample_bm),
):
    print(ks_caption, ks_2samp(generated_sample['actual_productivity'], data_sample['actual_productivity']))
