In [160]:
import pandas as pd
import numpy as np

In [161]:
# Read the indicators CSV into a DataFrame, decoding the file as Latin-1
path = "/content/Indicadores_municipales_sabana_DA.csv"
df = pd.read_csv(filepath_or_buffer=path, encoding="latin-1")

In [162]:
# Drop columns that contain no data at all
df = df.dropna(axis=1, how='all')

# Fill the missing numeric cells with the mean of their column.
# numeric_only=True is needed because the frame still holds text columns here;
# pandas >= 2.0 raises a TypeError when asked to average them.
df = df.fillna(df.mean(numeric_only=True))

# Cells that are still empty belong to categorical columns: fill them with the
# mode of the column. The result is assigned back to the frame because a chained
# `df[column].fillna(..., inplace=True)` may operate on a temporary copy and
# leave `df` unchanged.
for column in df.columns:
    if df[column].isnull().any():
        moda = df[column].mode()[0]
        df[column] = df[column].fillna(moda)


In [163]:
# Total number of cells that are still missing anywhere in the frame
empty_cell = df.isna().values.sum()
print(f"Empty cells: {empty_cell}")

Empty cells: 0


In [164]:
# Drop the columns that are not used by the models
columns_to_delete = ['ent', 'nom_ent', 'mun', 'clave_mun', 'nom_mun']
df = df.drop(columns=columns_to_delete)

# Ordinal encoding of the categorical levels
mapping = {'Muy bajo': 1, 'Bajo': 2, 'Medio': 3, 'Alto': 4, 'Muy alto': 5}
columns_to_convert = ['gdo_rezsoc00', 'gdo_rezsoc05', 'gdo_rezsoc10']  # columns where the encoding is applied

for columna in columns_to_convert:
    # `.map` plus an explicit cast guarantees an integer column. `.replace`
    # depended on pandas silently downcasting object -> int, which is deprecated.
    # A value missing from `mapping` becomes NaN and makes the cast raise,
    # instead of leaving stray strings in a column the models treat as numeric.
    df[columna] = df[columna].map(mapping).astype('int64')

In [165]:
# Show the last rows of the encoded columns to check the conversion
cathegorical_column = ['gdo_rezsoc00', 'gdo_rezsoc05', 'gdo_rezsoc10']

print(df.loc[:, cathegorical_column].tail())

      gdo_rezsoc00  gdo_rezsoc05  gdo_rezsoc10
2451             3             2             2
2452             1             1             1
2453             1             1             1
2454             2             2             1
2455             4             2             1


In [166]:
# Reorder the columns so the label is the last one; later cells read the
# features with iloc[:, :-1] and the label with iloc[:, -1]
labels_y = 'gdo_rezsoc10'
df = df[[*(name for name in df.columns if name != labels_y), labels_y]]

# Display the reordered dataframe
df

Unnamed: 0,pobtot_ajustada,pobreza,pobreza_e,pobreza_m,vul_car,vul_ing,npnv,ic_rezedu,ic_asalud,ic_segsoc,...,pobreza_cap_90,pobreza_cap_00,pobreza_cap_10,pobreza_patrim_90,pobreza_patrim_00,pobreza_patrim_10,gini_90,gini_00,gini_10,gdo_rezsoc10
0,794304,30.531104,2.264478,28.266627,27.983320,8.419106,33.066469,14.970553,24.034493,41.799885,...,20.4,12.7,18.474600,43.4,33.7,41.900398,0.473,0.425,0.422628,1
1,48592,67.111172,8.040704,59.070468,22.439389,5.557604,4.891835,21.222712,15.514032,78.003570,...,39.9,29.0,30.980801,64.2,48.9,59.175800,0.379,0.533,0.343879,1
2,53104,61.360527,7.241238,54.119289,29.428583,2.921336,6.289554,27.361207,20.812551,80.051980,...,39.5,33.1,28.259199,63.9,57.9,56.504902,0.414,0.465,0.386781,1
3,14101,52.800458,4.769001,48.031458,27.128568,7.709276,12.361698,20.889023,14.071657,65.831374,...,35.2,21.0,22.386101,59.7,40.1,51.164501,0.392,0.541,0.344984,1
4,101379,45.338512,6.084037,39.254475,26.262912,8.279864,20.118712,20.578144,16.567818,52.616992,...,36.6,22.6,22.139999,60.6,42.2,45.703899,0.391,0.469,0.458083,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,21016,74.848837,12.301183,62.547654,19.229856,3.177689,2.743618,27.350040,36.056322,76.550988,...,51.8,54.8,41.368999,73.5,70.9,70.859596,0.403,0.589,0.342037,2
2452,27385,65.450191,10.203506,55.246687,23.623556,5.007426,5.918827,29.914879,53.313420,74.542926,...,34.2,25.9,20.563601,57.8,44.1,46.659199,0.422,0.463,0.362527,1
2453,117528,29.541959,3.535624,26.006335,16.644262,8.828019,44.985759,11.936088,18.316528,32.666426,...,15.7,20.7,12.115300,36.6,41.8,32.302700,0.528,0.498,0.436339,1
2454,20456,78.374962,14.607016,63.767946,13.750759,4.440331,3.433948,26.649950,11.769479,83.235286,...,36.2,36.4,30.037100,60.5,54.7,57.394501,0.380,0.483,0.365307,1


#KNN without libraries

In [167]:
# Seed NumPy's global random generator so the shuffle is reproducible
np.random.seed(0)

# Shuffle the rows
df = df.sample(frac=1)

# Split the dataset: first 80% of the shuffled rows for training, the rest for testing
train_size = int(0.8 * len(df))
train_set = df.iloc[:train_size]
test_set = df.iloc[train_size:]

# Features are every column but the last; the label is the last column
X_train = train_set.iloc[:, :-1].values
y_train = train_set.iloc[:, -1].values
X_test = test_set.iloc[:, :-1].values
y_test = test_set.iloc[:, -1].values

# Per-feature statistics computed on the training rows only
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

# A feature that is constant in the training set has std == 0, which would turn
# the whole column into NaN/inf. Use a scale of 1 for such columns, which is also
# how sklearn's StandardScaler treats zero-variance features.
std = np.where(std == 0, 1.0, std)

# Standardize both sets with the training statistics
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

In [168]:
#define the knn algorithm
def knn(X_train, y_train, X_test, k):
    """Classify each row of X_test by majority vote of its k nearest training rows.

    Distances are Euclidean. When several labels receive the same number of
    votes, the smallest label (in sorted order) wins.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training feature rows.
    y_train : array-like, shape (n_train,)
        Training labels. Any sortable label type is accepted (integers,
        negative integers, strings, ...).
    X_test : array-like, shape (n_test, n_features)
        Rows to classify.
    k : int
        Number of neighbours that vote; if larger than n_train, every
        training row votes.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of X_test.

    Raises
    ------
    ValueError
        If k is smaller than 1.
    """
    if k < 1:
        # A negative k would silently slice "all but the last |k|" neighbours
        raise ValueError("k must be a positive integer")

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    y_pred = []
    for test_point in np.asarray(X_test):
        distances = np.sqrt(np.sum((X_train - test_point) ** 2, axis=1))
        k_indices = np.argsort(distances)[:k]
        # np.unique counts votes for any label type; np.bincount only
        # accepts non-negative integers
        labels, votes = np.unique(y_train[k_indices], return_counts=True)
        y_pred.append(labels[votes.argmax()])
    return np.array(y_pred)

In [169]:
# Number of neighbours that vote on each prediction
k = 25

# Classify every test row with the knn function
y_pred = knn(X_train, y_train, X_test, k)

# Accuracy: fraction of test rows whose prediction equals the true label
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 83.13%


In [170]:
# Draw 5 random positions (with replacement) from the testing set
random_samples = np.random.randint(0, len(y_test), 5)

# Predicted and true labels at those positions
predictions = y_pred[random_samples]
real_labels = y_test[random_samples]

# Print each pair side by side
for sample_number, (predicted, actual) in enumerate(zip(predictions, real_labels), start=1):
    print(f"Sample {sample_number}:")
    print(f"    Prediction: {predicted}")
    print(f"    Actual label: {actual}")

Sample 1:
    Prediction: 1
    Actual label: 1
Sample 2:
    Prediction: 1
    Actual label: 1
Sample 3:
    Prediction: 4
    Actual label: 4
Sample 4:
    Prediction: 1
    Actual label: 1
Sample 5:
    Prediction: 2
    Actual label: 2


#KNN with libraries

In [171]:
# Shell command: install numpy and scikit-learn for the library-based sections
!pip install numpy scikit-learn



In [172]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [173]:
# Features are every column but the last; the label is the last column
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Hold out 20% of the rows for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

# Standardize the features: learn mean/variance on the training rows only,
# then apply the same transformation to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [174]:
# Number of neighbours used by the classifier
k = 25
model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# Predict the label of every test row
y_pred = model.predict(X_test)

In [175]:
# Fraction of test rows classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 82.52%


In [176]:
# Draw 5 random positions (with replacement) from the testing set
random_samples = np.random.randint(0, len(y_test), 5)

# Predicted and true labels at those positions
predictions = y_pred[random_samples]
real_labels = y_test[random_samples]

# Print each pair side by side
for sample_number, (predicted, actual) in enumerate(zip(predictions, real_labels), start=1):
    print(f"Sample {sample_number}:")
    print(f"    Prediction: {predicted}")
    print(f"    Actual label: {actual}")

Sample 1:
    Prediction: 1
    Actual label: 1
Sample 2:
    Prediction: 4
    Actual label: 4
Sample 3:
    Prediction: 3
    Actual label: 3
Sample 4:
    Prediction: 1
    Actual label: 1
Sample 5:
    Prediction: 1
    Actual label: 1


#Perceptron without libraries

In [177]:
# Seed NumPy's global random generator so the shuffle is reproducible
np.random.seed(1)

# Shuffle the rows
df = df.sample(frac=1)

# Split the dataset: first 80% of the shuffled rows for training, the rest for testing
train_size = int(0.8 * len(df))
test_set = df.iloc[train_size:]
train_set = df.iloc[:train_size]

# Features are every column but the last; the label is the last column
X_train = train_set.iloc[:, :-1].values
y_train = train_set.iloc[:, -1].values
X_test = test_set.iloc[:, :-1].values
y_test = test_set.iloc[:, -1].values

# Convert the labels to binary: 1 if the class is 1 and 0 otherwise
y_binary = np.where(y_train == 1, 1, 0)
y_train_binary = y_binary
y_test_binary = np.where(y_test == 1, 1, 0)

# Per-feature statistics computed on the training rows only
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

# A feature that is constant in the training set has std == 0, which would turn
# the whole column into NaN/inf. Use a scale of 1 for such columns, which is also
# how sklearn's StandardScaler treats zero-variance features.
std = np.where(std == 0, 1.0, std)

# Standardize the training data
X_train = (X_train - mean) / std

# Standardize the testing data with the training statistics
X_test = (X_test - mean) / std


In [178]:
#define the activation function
def activation(x):
    """Step function: 1 where x is greater than or equal to 0, otherwise 0.

    Works element-wise on arrays and also accepts a single number.
    """
    is_non_negative = np.greater_equal(x, 0)
    return np.where(is_non_negative, 1, 0)


In [179]:
#create the perceptron algorithm
def perceptron(X, y, learning_rate, epochs):
    """Train a perceptron with the online (sample-by-sample) update rule.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
        Training feature rows.
    y : numpy.ndarray, shape (n_samples,)
        Target per row; the prediction it is compared against is 0 or 1.
    learning_rate : float
        Step size applied to every correction.
    epochs : int
        Maximum number of passes over the training rows.

    Returns
    -------
    tuple
        (weights, bias): weights is a 1-D array with one entry per feature.

    Notes
    -----
    The weights start from small random values drawn from NumPy's global
    random generator, so seed it beforehand for reproducible results.
    """
    weights = np.random.randn(X.shape[1]) * 0.01
    bias = 0

    for _ in range(epochs):
        mistakes = 0
        for i in range(X.shape[0]):
            linear_output = np.dot(X[i], weights) + bias
            # Same threshold as `activation`, applied to a single value
            prediction = 1 if linear_output >= 0 else 0
            error = y[i] - prediction
            if error != 0:
                update = learning_rate * error
                weights += update * X[i]
                bias += update
                mistakes += 1
        # A pass without corrections leaves the model unchanged, so every
        # remaining pass would be identical: stop early.
        if mistakes == 0:
            break

    return weights, bias


In [180]:
# Training hyperparameters: step size and number of passes over the training set
learning_rate = 0.1
epochs = 1000

weights, bias = perceptron(
    X=X_train,
    y=y_train_binary,
    learning_rate=learning_rate,
    epochs=epochs,
)


In [181]:
# Apply the trained weights to the test rows and threshold the result
linear_output_test = X_test @ weights + bias
y_pred_test = activation(linear_output_test)

# Accuracy: fraction of test rows whose prediction equals the binary label
accuracy_test = np.mean(y_pred_test == y_test_binary)
print(f"Accuracy: {accuracy_test * 100:.2f}%")


Accuracy: 98.58%


In [182]:
# Draw 5 random positions (with replacement) from the testing set
random_samples = np.random.randint(0, len(y_test), 5)

# Predicted and true binary labels at those positions
predictions = y_pred_test[random_samples]
real_labels = y_test_binary[random_samples]

# Print each pair side by side
for sample_number, (predicted, actual) in enumerate(zip(predictions, real_labels), start=1):
    print(f"Sample {sample_number}:")
    print(f"   Prediction: {predicted}")
    print(f"   Actual label: {actual}")


Sample 1:
   Prediction: 1
   Actual label: 1
Sample 2:
   Prediction: 0
   Actual label: 0
Sample 3:
   Prediction: 1
   Actual label: 1
Sample 4:
   Prediction: 0
   Actual label: 0
Sample 5:
   Prediction: 1
   Actual label: 1


#Perceptron with libraries

In [183]:
#import the library
from sklearn.linear_model import Perceptron

In [184]:
# Features are every column but the last; the label is the last column
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Binary target: 1 for class "1" and 0 for any other class
y_binary = (y == 1).astype(int)


In [185]:
# Hold out 20% of the rows for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: learn mean/variance on the training rows only,
# then apply the same transformation to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [186]:
# Build the scikit-learn perceptron and fit it on the training rows
model = Perceptron(eta0=0.1, max_iter=1000, random_state=42)
model.fit(X=X_train, y=y_train)


In [187]:
# Predict the binary label of every test row
y_pred = model.predict(X_test)

# Accuracy: fraction of test rows whose prediction equals the true label
accuracy_test = np.mean(y_pred == y_test)
print(f"Accuracy: {accuracy_test * 100:.2f}%")


Accuracy: 97.76%


In [188]:
# Draw 5 random positions (with replacement) from the testing set
random_samples = np.random.randint(0, len(y_test), 5)

# Predicted and true labels at those positions
predictions = y_pred[random_samples]
real_labels = y_test[random_samples]

# Print each pair side by side
for sample_number, (predicted, actual) in enumerate(zip(predictions, real_labels), start=1):
    print(f"Sample {sample_number}:")
    print(f"   Prediction: {predicted}")
    print(f"   Actual label: {actual}")


Sample 1:
   Prediction: 1
   Actual label: 1
Sample 2:
   Prediction: 1
   Actual label: 1
Sample 3:
   Prediction: 0
   Actual label: 0
Sample 4:
   Prediction: 0
   Actual label: 0
Sample 5:
   Prediction: 0
   Actual label: 0
