Load Data

In [1]:
import pandas as pd

# Specify the path to your Excel file
excel_file_path = 'data.xlsx'
new_excel_file_path = 'Aplicanti_Compensatii_Locatari.xlsx'

# Read the Excel file into a pandas DataFrame
df_original = pd.read_excel(new_excel_file_path)

Pre-Proccessing 

In [2]:
from datetime import datetime# Get the current year

df_copy = df_original.copy(deep=True)

# Remove consum
df_copy.drop(["Consum volum 11.2022","Consum volum 12.2022","Consum volum 01.2023","Consum volum 02.2023","Consum volum 03.2023"],axis=1, inplace=True)

# Remove all NaN
df_copy = df_copy.dropna()

print(df_copy)

# Sex 0 -> 1
df_copy['Sex'] = df_copy['Sex'].replace(0, 1)

# Age ranging 
current_year = datetime.now().year
df_copy['Age'] = current_year - df_copy['DateOfBirth']
age_ranges = [0, 20, 40, 60, 80, 100]
labels = ['0-20', '20-40', '40-60', '60-80', '80-100']
df_copy['AgeRange'] = pd.cut(df_copy['Age'], bins=age_ranges, labels=labels, include_lowest=True,ordered=False)
df_copy['AgeRange'] = df_copy['AgeRange'].astype(str)

# Salary ranging
salary_ranges = [0, 10000,20000, 40000, 60000, 80000, 100000]
labels = ['0-10k','10-20k', '20k-40k', '40k-60k', '60k-80k', '80k-100k']
df_copy['SalaryRange'] = pd.cut(df_copy['AverageIncome'], bins=salary_ranges, labels=labels, include_lowest=True,ordered=False)
df_copy['SalaryRange'] = df_copy['SalaryRange'].astype(str)

                                          Id  Grad         Raion  \
0       0000592c-2e90-4a36-9e39-8aabd287aa93     4  mun.Chişinău   
2       000088d5-b06f-44df-a20e-a7d13bc84028     4  mun.Chişinău   
4       00010c3f-1305-4031-869a-580b39fc2a75     3  mun.Chişinău   
6       00013c26-465e-4ca1-9b84-c6859e2b634d     4  mun.Chişinău   
9       00020e46-9b16-4fdd-a437-d59ffccf2844     0  mun.Chişinău   
...                                      ...   ...           ...   
251571  a51f9aa6-ee7e-41ff-8d14-b7137af29f7a     4  mun.Chişinău   
251573  a51fb7f4-a5ba-47b6-b7ca-f3afeff20223     2  mun.Chişinău   
251577  a5201656-da31-4d70-bb72-3f2381f1524b     4  mun.Chişinău   
251579  a5202710-bf59-40bf-964d-48947a7b1088     4  mun.Chişinău   
251581  a5202eaa-84b7-42bf-8f32-f81188248a7d     4  mun.Chişinău   

          Localitate             Strada Tip incalzire principal  DateOfBirth  \
0         s.Bubuieci          M.Frunza            Gaze naturale       1949.0   
2           s.Băcioi   

Tokenization

In [3]:
from sklearn.preprocessing import LabelEncoder

salary_range_label_encoder = LabelEncoder()
age_range_label_encoder = LabelEncoder()
localitate_range_label_encoder = LabelEncoder()
tip_incalzire_principal_label_encoder = LabelEncoder()
company_name_label_encoder = LabelEncoder()

# Fit and transform the 'Category' column
df_copy['SalaryRange_encoded'] = salary_range_label_encoder.fit_transform(df_copy['SalaryRange'])
df_copy['AgeRange_encoded'] = age_range_label_encoder.fit_transform(df_copy['AgeRange'])
df_copy['Localitate_encoded'] = localitate_range_label_encoder.fit_transform(df_copy['Localitate'])
df_copy['Tip_incalzire_principal_encoded'] = tip_incalzire_principal_label_encoder.fit_transform(df_copy['Tip incalzire principal'])
df_copy['Company_name_encoded'] = company_name_label_encoder.fit_transform(df_copy['Name'])



print(df_copy["Localitate_encoded"])
print(df_copy["Tip_incalzire_principal_encoded"])

0          8
2         12
4          0
6          0
9          0
          ..
251571     0
251573     0
251577     0
251579    31
251581     0
Name: Localitate_encoded, Length: 123630, dtype: int64
0         2
2         2
4         3
6         3
9         2
         ..
251571    3
251573    3
251577    3
251579    2
251581    3
Name: Tip_incalzire_principal_encoded, Length: 123630, dtype: int64


Anomaly detection with KMeans 

In [None]:
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

features = df_copy[['Grad','Sex','SalaryRange_encoded', 'AgeRange_encoded','Localitate_encoded','Tip_incalzire_principal_encoded','Company_name_encoded']]

# Standardize the feature columns
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
print(len(features_scaled))
 
# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
df_copy['Cluster'] = kmeans.fit_predict(features_scaled)

# Visualize the clustering (assuming 2D features for simplicity)
# plt.scatter(features_scaled[:, 0], features_scaled[:, 2], c=df_copy['Cluster'], cmap='viridis')
# plt.title('K-Means Clustering')
# plt.xlabel('Feature1')
# plt.ylabel('Feature2')
# plt.show()


for cluster_id in range(5):
    cluster_data = features_scaled[df_copy['Cluster'] == cluster_id]
    
    isolation_forest = IsolationForest(contamination=0.05, random_state=42)
    df_copy.loc[df_copy['Cluster'] == cluster_id, 'IsOutlier'] = isolation_forest.fit_predict(cluster_data)

# Visualize anomalies
anomalies = df_copy[df_copy['IsOutlier'] == -1]
# plt.scatter(features_scaled[:, 0], features_scaled[:, 2], c=df_copy['Cluster'], cmap='viridis', label='Normal')
# plt.scatter(anomalies['Grad'], anomalies['SalaryRange_encoded'], color='red', label='Anomaly')
# plt.title('Anomaly Detection')
# plt.xlabel('Feature1')
# plt.ylabel('Feature2')
# plt.legend()
# plt.show()


normal_data = df_copy[df_copy['IsOutlier'] == 1]
anomaly_data = df_copy[df_copy['IsOutlier'] == -1]

# Plot a scatter plot for Vulnerability vs Feature1 for anomalies
# plt.scatter(anomaly_data['Localitate_encoded'], anomaly_data['SalaryRange_encoded'], c='red', label='Anomalies')
# plt.title('Vulnerability vs Feature1 for Anomalies')
# plt.xlabel('Location')
# plt.ylabel('Salary')
# plt.legend()
# plt.show()


# Plot a scatter plot for Feature1 vs Feature2 with color-coded clusters
plt.scatter(df_copy['SalaryRange_encoded'], df_copy['AgeRange_encoded'], c=df_copy['Cluster'], cmap='viridis', edgecolor='k')
plt.title('Feature1 vs Feature2 with K-Means Clusters')
plt.xlabel('Salary')
plt.ylabel('Age')
plt.colorbar(label='Cluster')
plt.show()


print(len(normal_data))
print(len(anomaly_data))

# Filter data for anomalies
# Inspect decision function scores
anomaly_scores = isolation_forest.fit_predict(features_scaled)
print(anomaly_scores)
# Identify top contributing features for anomalies
top_contributing_features = features.columns[anomaly_scores.argsort()[:5]]  # Adjust the number of top features as needed

print("Top Contributing Features for Anomalies:")
print(top_contributing_features)

In [25]:
df_copy_consum[["Consum volum 11.2022","Consum volum 12.2022","Consum volum 01.2023","Consum volum 02.2023","Consum volum 03.2023"]]

Unnamed: 0,Consum volum 11.2022,Consum volum 12.2022,Consum volum 01.2023,Consum volum 02.2023,Consum volum 03.2023
0,358.0,416.0,387.0,327.0,348.0
2,138.0,149.0,151.0,135.0,143.0
4,103.0,100.0,103.0,100.0,93.0
6,28.0,36.0,42.0,36.0,25.0
9,84.0,85.0,89.0,85.0,79.0
...,...,...,...,...,...
251571,33.0,36.0,37.0,33.0,37.0
251573,44.0,48.0,52.0,61.0,67.0
251577,31.0,27.0,36.0,32.0,25.0
251579,69.0,74.0,79.0,76.0,68.0


Anomaly detection using Grad as Clusters

In [6]:
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

features = df_copy[["Grad",'Sex','SalaryRange_encoded', 'AgeRange_encoded','Localitate_encoded','Tip_incalzire_principal_encoded','Company_name_encoded','NrMembriDeFamilie']]

# Standardize the feature columns
# scaler = StandardScaler()
# features_scaled = scaler.fit_transform(features)
# print(len(features_scaled))

scaler = MinMaxScaler(feature_range=(-1, 1))
features_normalized = scaler.fit_transform(features)

for cluster_id in [0,1,2,3,4]:
    cluster_data = features_normalized[df_copy['Grad'] == cluster_id]
    
    isolation_forest = IsolationForest(contamination=0.10, random_state=42)
    df_copy.loc[df_copy['Grad'] == cluster_id, 'IsOutlier'] = isolation_forest.fit_predict(cluster_data)

print(df_copy['IsOutlier'])
df_copy = pd.merge(df_copy, df_copy_consum[["Consum volum 11.2022","Consum volum 12.2022","Consum volum 01.2023","Consum volum 02.2023","Consum volum 03.2023"]], on='Id', how='right')
df_copy.to_csv("New_data.csv", encoding='utf-8', index=False)

0         1.0
2         1.0
4         1.0
6         1.0
9         1.0
         ... 
251571    1.0
251573    1.0
251577    1.0
251579   -1.0
251581    1.0
Name: IsOutlier, Length: 123630, dtype: float64


MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [51]:
print(df_copy[df_copy['IsOutlier'] == -1])


                                          Id  Grad         Raion  \
32      000474fe-0d03-46ac-b9fe-218b539d2a85     4  mun.Chişinău   
43      0005c982-6dcb-4950-8f9b-c908db79fa85     4  mun.Chişinău   
58      00071ceb-57ec-4f51-97ec-25b6700b42d4     3  mun.Chişinău   
64      0008952a-05ed-42a9-8a32-cd83412d5da2     2  mun.Chişinău   
80      000a4f2d-8a87-45a4-810b-be1680a5dfa8     3  mun.Chişinău   
...                                      ...   ...           ...   
251502  a51596f7-f7e4-4bb5-b227-643a7584bf37     4  mun.Chişinău   
251513  a5181f99-358b-4fd1-bbed-46572b9f5d66     4  mun.Chişinău   
251564  a51ec2e1-7ee9-40cf-a869-c9cab2805f0b     3  mun.Chişinău   
251568  a51f600f-74be-4ced-b4a6-715a5e74606c     4  mun.Chişinău   
251579  a5202710-bf59-40bf-964d-48947a7b1088     4  mun.Chişinău   

          Localitate                Strada Tip incalzire principal  \
32         s.Truşeni             Basarabia       Combustibil solid   
43      mun.Chişinău          Malina Mica  

Anomaly Classification

In [17]:
# Import train_test_split function
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import pickle


features = df_copy[['Grad','Sex','SalaryRange_encoded', 'AgeRange_encoded','Localitate_encoded','Tip_incalzire_principal_encoded','Company_name_encoded','NrMembriDeFamilie']]
target = df_copy[["IsOutlier"]]

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3,random_state=109) # 70% training and 30% test

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

  y = column_or_1d(y, warn=True)


Accuracy: 0.9639515759389576
Precision: 0.9705511395301089
Recall: 0.99003650290228


In [64]:
import pickle
def return_prediction(model_file, data_dict):
    # x = ['Grad','Sex','SalaryRange_encoded', 'AgeRange_encoded','Localitate_encoded','Tip_incalzire_principal_encoded','Company_name_encoded','NrMembriDeFamilie']
    row_to_predict = pd.DataFrame(columns=['Grad','Sex','SalaryRange_encoded', 'AgeRange_encoded','Localitate_encoded','Tip_incalzire_principal_encoded','Company_name_encoded','NrMembriDeFamilie'])
    row_to_predict = row_to_predict.append({'Grad':data_dict['Grad'],'Sex':data_dict['Sex'],'SalaryRange_encoded':data_dict["SalaryRange_encoded"], 'AgeRange_encoded':data_dict["AgeRange_encoded"],'Localitate_encoded':data_dict["Localitate_encoded"],'Tip_incalzire_principal_encoded':data_dict["Tip_incalzire_principal_encoded"],'Company_name_encoded':data_dict["Company_name_encoded"],'NrMembriDeFamilie':data_dict["NrMembriDeFamilie"]}, ignore_index=True)
    
    with open('svm_model.pkl', 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    # Make predictions with the loaded model
    new_predictions = loaded_model.predict(row_to_predict)
    print(new_predictions)

data = {'Grad':4,'Sex':1,'SalaryRange_encoded':1, 'AgeRange_encoded':4,'Localitate_encoded':1,'Tip_incalzire_principal_encoded':1,'Company_name_encoded':1,'NrMembriDeFamilie':10}
model_file = "/Users/afanasichihaioglo/Desktop/ProtonEnergySolutions/Software/Hackathon/UNDP/svm_model.pkl"

return_prediction(model_file,data)

    

[-1.]


  row_to_predict = row_to_predict.append({'Grad':data_dict['Grad'],'Sex':data_dict['Sex'],'SalaryRange_encoded':data_dict["SalaryRange_encoded"], 'AgeRange_encoded':data_dict["AgeRange_encoded"],'Localitate_encoded':data_dict["Localitate_encoded"],'Tip_incalzire_principal_encoded':data_dict["Tip_incalzire_principal_encoded"],'Company_name_encoded':data_dict["Company_name_encoded"],'NrMembriDeFamilie':data_dict["NrMembriDeFamilie"]}, ignore_index=True)


In [47]:
# Select one row from X_test for prediction (change the index as needed)
# row_to_predict = X_test.iloc[[34512]]
# print(type((row_to_predict)))
# row_to_predict = pd.DataFrame([1,4, 0,1,1,1,2])
# row_to_predict = pd.DataFrame({'Sex':1,'SalaryRange_encoded':4, 'AgeRange_encoded':0,'Localitate_encoded':1,'Tip_incalzire_principal_encoded':1,'Company_name_encoded':1,'NrMembriDeFamilie':2})
import numpy as np
row_to_predict = pd.DataFrame(columns=['Grad','Sex','SalaryRange_encoded', 'AgeRange_encoded','Localitate_encoded','Tip_incalzire_principal_encoded','Company_name_encoded','NrMembriDeFamilie'])
row_to_predict = row_to_predict.append({'Grad':0,'Sex':1,'SalaryRange_encoded':0, 'AgeRange_encoded':4,'Localitate_encoded':1,'Tip_incalzire_principal_encoded':1,'Company_name_encoded':1,'NrMembriDeFamilie':5}, ignore_index=True)


# Predict the response for the selected row
prediction = clf.predict(row_to_predict)

# Get the absolute values of the coefficients
abs_coefficients = np.abs(clf.coef_)

# Create a dictionary mapping feature names to their absolute coefficients
feature_coefficients = dict(zip(features.columns, abs_coefficients[0]))

# Sort the features by their absolute coefficients in descending order
top_features = sorted(feature_coefficients.items(), key=lambda x: x[1], reverse=True)

# Print the top features
print("Top features and their absolute coefficients:")
for feature, coefficient in top_features:
    print(f"{feature}: {coefficient}")

# Print the prediction
print("Prediction:", prediction)


# # Predict the response for the selected row
# prediction = clf.predict(row_to_predict)

# # Get indices of support vectors
# support_vector_indices = clf.support_

# # Get the corresponding support vectors
# support_vectors = X_train.iloc[support_vector_indices]

# # Print the prediction
# # print("Prediction:", prediction)

# print(support_vectors)

# # Print the features and their values for the support vectors
# # print("Features and their values for support vectors:")
# # for index, row in support_vectors.iterrows():
# #     print(row)

# # You can also check the dual coefficients (dual_coef_) associated with the support vectors
# print("Dual coefficients of support vectors:", clf.dual_coef_[0])

# # Get the coefficients of the SVM model (weights)
# coefficients = clf.coef_[0]

# # Print the prediction and the features along with their coefficients
# print("Prediction:", prediction)
# print("Features and their contributions:")
# for feature, coefficient in zip(features.columns, coefficients):
#     print(f"{feature}: {coefficient}")

# # If you want to print the intercept term (bias)
# print("Intercept (Bias):", clf.intercept_[0])

Top features and their absolute coefficients:
SalaryRange_encoded: 1.1686931347221616
Tip_incalzire_principal_encoded: 0.9725490992489085
Grad: 0.4902585748095589
Sex: 0.4747998038625383
NrMembriDeFamilie: 0.3371777154316078
Company_name_encoded: 0.20789929232523718
Localitate_encoded: 0.15024242022673207
AgeRange_encoded: 0.09820137363976755
Prediction: [1.]


  row_to_predict = row_to_predict.append({'Grad':0,'Sex':1,'SalaryRange_encoded':0, 'AgeRange_encoded':4,'Localitate_encoded':1,'Tip_incalzire_principal_encoded':1,'Company_name_encoded':1,'NrMembriDeFamilie':5}, ignore_index=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Assuming df_copy is your DataFrame
features = df_copy[["Sex", "SalaryRange_encoded", "AgeRange_encoded", "Localitate_encoded", "Tip_incalzire_principal_encoded", "Company_name_encoded", "NrMembriDeFamilie"]]
target = df_copy["IsOutlier"]

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=109)  # 70% training and 30% test

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a Logistic Regression model
logreg = LogisticRegression()

# Train the model using the training sets
logreg.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = logreg.predict(X_test_scaled)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Compute precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Compute recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Compute F1 score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)



In [None]:
# Select one row from X_test for prediction (change the index as needed)
row_to_predict = X_test_scaled[[2000]]
row_to_predict = pd.DataFrame(columns=['Sex','SalaryRange_encoded', 'AgeRange_encoded','Localitate_encoded','Tip_incalzire_principal_encoded','Company_name_encoded','NrMembriDeFamilie'])
row_to_predict = row_to_predict.append({'Sex':1,'SalaryRange_encoded':1, 'AgeRange_encoded':4,'Localitate_encoded':1,'Tip_incalzire_principal_encoded':1,'Company_name_encoded':1,'NrMembriDeFamilie':1}, ignore_index=True)


# Predict the response for the selected row
prediction = logreg.predict(row_to_predict)

# Get the coefficients and intercept
coefficients = logreg.coef_[0]
intercept = logreg.intercept_[0]

# Print the coefficients and intercept
print("Intercept:", intercept)
print("Coefficients for each feature:")
for feature, coefficient in zip(features.columns, coefficients):
    print(f"{feature}: {coefficient}")

# Print the prediction
print("Prediction:", prediction)



NEURAL NETWORKS

In [8]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming df_copy is your DataFrame
features = df_copy[["Sex", "SalaryRange_encoded", "AgeRange_encoded", "Localitate_encoded", "Tip_incalzire_principal_encoded", "Company_name_encoded", "NrMembriDeFamilie"]]
target = df_copy["IsOutlier"]

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=109)  # 70% training and 30% test

# Standardize the features
# nn_scaler = StandardScaler()
# X_train_scaled = nn_scaler.fit_transform(X_train)
# X_test_scaled = nn_scaler.transform(X_test)
nn_scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = nn_scaler.fit_transform(X_train)
X_test_scaled = nn_scaler.transform(X_test)

# Create a neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba >= 0.5).astype(int)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.821102491933823
Precision: 0.8844261752368062
Recall: 0.821102491933823
F1 Score: 0.8515887775995836


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Get the activations of neurons in the first layer for a specific sample (change the index as needed)
sample_index = 10
sample = X_test_scaled[sample_index].reshape(1, -1)
activations = model.layers[0](sample).numpy()

# Print the activations of neurons in the first layer
print("Activations of neurons in the first layer:")
print(activations)

In [12]:
# Convert the data point to a numpy array
import numpy as np
new_data_point = np.array([[1, 1, 4, 1, 1, 1, 1]])

# Standardize the features (assuming you used StandardScaler during training)
new_data_point_scaled = nn_scaler.transform(new_data_point)
print(new_data_point_scaled)

# Make a prediction
prediction_proba = model.predict(new_data_point_scaled)
prediction_class = (prediction_proba >= 0.5).astype(int)

print("Predicted Probability:", prediction_proba)
print("Predicted Class:", prediction_class)

[[-1.         -0.66666667  0.6        -0.94117647 -0.33333333 -0.71428571
  -0.83333333]]
Predicted Probability: [[1.]]
Predicted Class: [[1]]


