In [165]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/70k-job-applicants-data-human-resource/stackoverflow_full.csv


# Q1)

## Import dataset

In [87]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [78]:
X,y

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 80% training data and 20% test data

## 1. Random Forest

## From Scratch

In [80]:
# Gini impurity calculation

def gini(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    return 1 - np.sum(np.square(probabilities))


# Information gain using Gini impurity

def gini_gain(X_column, y, threshold):
    # Create split
    left_mask = X_column < threshold
    right_mask = ~left_mask

    left_y = y[left_mask]
    right_y = y[right_mask]

    left_impurity = gini(left_y)
    right_impurity = gini(right_y)

    n = len(y)
    n_left = len(left_y)
    n_right = len(right_y)

    # Weighted impurity
    weighted_impurity = (n_left / n) * left_impurity + (n_right / n) * right_impurity

    return gini(y) - weighted_impurity


# CART Decision Tree class

class CARTTree:
    def __init__(self, max_depth=5, max_features=None):
        self.max_depth = max_depth
        self.tree = None
        self.max_features = max_features

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        if depth >= self.max_depth or len(np.unique(y)) == 1:
            return np.bincount(y).argmax()  # Return the most common target class
        
        # Selecting the best split
        best_gain = -1
        split_idx, split_threshold = None, None
        for feature_index in range(num_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                gain = gini_gain(X[:, feature_index], y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_index
                    split_threshold = threshold

        if best_gain == -1:
            return np.bincount(y).argmax()  # Return the most common target class

        # Recursively grow the tree
        left_mask = X[:, split_idx] < split_threshold
        right_mask = ~left_mask
        left_subtree = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        # Return a node dictionary
        return {"feature_index": split_idx, "threshold": split_threshold, "left": left_subtree, "right": right_subtree}

    def _predict(self, x, tree):
        if not isinstance(tree, dict):
            return tree
        feature_idx = tree['feature_index']
        threshold = tree['threshold']
        if x[feature_idx] < threshold:
            return self._predict(x, tree['left'])
        else:
            return self._predict(x, tree['right'])

    def predict(self, X):
        return np.array([self._predict(x, self.tree) for x in X])

In [81]:
# Random Forest class

class RandomForest:

    def __init__(self, num_trees=10, max_depth=10, max_features=None):

        self.num_trees = num_trees

        self.max_depth = max_depth

        self.max_features = max_features

        self.trees = []
        

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[idxs], y[idxs]


    def fit(self, X, y):
        self.trees = []
        for _ in range(self.num_trees):
            tree = CARTTree(max_depth=self.max_depth, max_features=self.max_features)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)


    def _predict_tree(self, x):
     tree_predictions = [tree._predict(x, tree.tree) for tree in self.trees]
     return np.bincount(tree_predictions).argmax()  # Majority vote


    def predict(self, X):
        return np.array([self._predict_tree(x) for x in X])


In [82]:
# Train the Random Forest on breast cancer dataset

random_forest = RandomForest()

random_forest.fit(X_train, y_train)



# Predict on the test set

rf_predictions = random_forest.predict(X_test)



# Calculate accuracy

accuracy = np.mean(rf_predictions == y_test)

print("Random Forest Accuracy on Iris Dataset:", accuracy)

Random Forest Accuracy on Iris Dataset: 1.0


## sklearn

In [100]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)

clf.fit(X_train, y_train)
RandomForestClassifier(...)
y_pred = clf.predict(X_test)
print(y_pred)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]


In [101]:
# Calculate accuracy

accuracy = np.mean(y_pred == y_test)

print("Random Forest Accuracy on Iris Dataset:", accuracy)

Random Forest Accuracy on Iris Dataset: 1.0


### Both have accuracy 1

## 2. SVM

## From scratch

In [88]:
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df["target"] = iris.target
df
X = df.drop(columns = ["target"])
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 80% training data and 20% test data

In [162]:

# SVM Parameters
class LinearSVM:
    def __init__(self, learning_rate=0.01, num_iterations=1000, C=1.0):
        self.alpha = learning_rate
        self.num_iter = num_iterations
        self.C = C
        # Initialize weights and bias
        self.W = None
        self.b = None

    def fit(self, X, y):
        num_samples, num_features = X.shape
        # Initialize weights and bias
        self.W = np.zeros(num_features)
        self.b = 0

        for _ in range(self.num_iter):
            loss = self.hinge_loss(self.W, self.b, X, y)
            grad_W, grad_b = self.gradient(X, y)
            
            self.W = self.W - self.alpha * grad_W
            self.b = self.b - self.alpha * grad_b
            

    # Hinge Loss function
    def hinge_loss(self, w, b, X, y):
        distances = 1 - y * (np.dot(X, w) + b)
        loss = self.C*np.sum(np.maximum(0, distances))  # hinge loss
        reg_loss = np.dot(w, w)  # Regularization term
        return loss + reg_loss
        
    # Gradient Descent Update
    def gradient(self, X, y):
        grad_W = np.zeros(len(self.W))
        grad_b = 0

        for i in range(len(y)):
            if y[i] * (np.dot(X[i], self.W) + self.b) < 1:
                grad_W = grad_W - y[i] * X[i]
                grad_b = grad_b - y[i]
            else:
                grad_W = grad_W + (1/ (2*(self.C)) )* self.W
        
        # Average gradients and include regularization term
        #grad_b = grad_b / X.shape[0]  # Average bias gradient
        #grad_W = grad_W / X.shape[0]
        
        return grad_W, grad_b

    # Prediction Function
    def predict(self, X):
        linear_output = np.dot(X, self.W) + self.b
        return np.where(linear_output >= 0, 1, -1)

    # Evaluation
    def evaluate(self, X_test, y_test):
        predictions = self.predict(X_test)
        accuracy = np.mean(predictions == y_test)
        return (predictions, accuracy)


In [163]:
svm = LinearSVM(learning_rate=0.001, num_iterations=1000)
svm.fit(X_train.values, y_train.values)

### Predictions

In [164]:
y_pred = svm.predict(X_test)
print("Accuracy for SVM model: ", np.mean(y_pred == y_test))

Accuracy for SVM model:  0.9111111111111111


## Using sklearn library

In [97]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_ovo = SVC(decision_function_shape='ovo')
svm_ovo.fit(X_train, y_train)

# Predict and evaluate the One-vs-One model
y_pred_ovo = svm_ovo.predict(X_test)
print("One-vs-One Accuracy:", accuracy_score(y_test, y_pred_ovo))

One-vs-One Accuracy: 1.0


In [98]:
# Initialize the SVM classifier with One-vs-All strategy
svm_ova = SVC(decision_function_shape='ovr')
svm_ova.fit(X_train, y_train)

# Predict and evaluate the One-vs-All model
y_pred_ova = svm_ova.predict(X_test)
print("One-vs-All Accuracy:", accuracy_score(y_test, y_pred_ova))

One-vs-All Accuracy: 1.0


## Classification Tree

## Scratch

In [86]:
cart_tree = CARTTree()

cart_tree.fit(X_train, y_train)



# Predict on training data

predictions = cart_tree.predict(X_test)



# Calculate accuracy

accuracy = np.mean(predictions == y_test)

print("CART Decision Tree Accuracy on Iris Dataset:", accuracy)

CART Decision Tree Accuracy on Iris Dataset: 0.9777777777777777


## Sklearn

In [102]:

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(accuracy)

1.0


## 4. Logistic Regression



## Scratch

In [142]:
import numpy as np

class LogisticRegression:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None
        self.num_classes = None
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        m, n = X.shape
        self.num_classes = len(np.unique(y))  # Number of classes
        self.weights = np.zeros((self.num_classes, n))  # weights for each class
        self.bias = np.zeros(self.num_classes)  # bias for each class
        
        for c in range(self.num_classes):
            # Create binary labels for class c vs all others
            y_binary = np.where(y == c, 1, 0)
            
            # Initialize the weights and bias for this class
            self.weights[c] = np.zeros(n)
            self.bias[c] = 0
            
            # Train the classifier for class c
            for _ in range(self.epochs):
                # Linear model for class c
                linear_model = np.dot(X, self.weights[c]) + self.bias[c]
                y_predicted = self.sigmoid(linear_model)
                
                # Compute the gradients
                dw = (1/m) * np.dot(X.T, (y_predicted - y_binary))
                db = (1/m) * np.sum(y_predicted - y_binary)
                
                # Update the parameters for class c
                self.weights[c] -= self.learning_rate * dw
                self.bias[c] -= self.learning_rate * db
    
    def predict(self, X):
        # Calculate probabilities for each class
        scores = np.dot(X, self.weights.T) + self.bias  # Shape: (m, num_classes)
        probabilities = self.sigmoid(scores)  # Apply sigmoid to each class
        
        # Get the index of the class with the highest probability for each sample
        y_pred = np.argmax(probabilities, axis=1)
        return y_pred


In [143]:
# Assume X_train, X_test, y_train, y_test are already defined and are numpy arrays

# Train the logistic regression model
model = LogisticRegression(learning_rate=0.01, epochs=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print("Multiclass Logistic Regression Accuracy:", accuracy)


Multiclass Logistic Regression Accuracy: 0.8666666666666667


## Sklearn

In [121]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = np.mean(y_pred == y_test)
print("Accuracy by logistic regression model: ", accuracy)

Accuracy by logistic regression model:  1.0


## 5. LDA

## Sklearn

In [123]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)
y_pred = (clf.predict(X_test))
accuracy = np.mean(y_pred == y_test)
print("Accuracy of LDA: ", accuracy)

Accuracy of LDA:  1.0


## 6. QDA

## Sklearn

In [124]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy of QDA: ", np.mean(y_test == y_pred))

Accuracy of QDA:  1.0


## 7. KNN

## Scratch

In [150]:
# sort elements of matrix and corresponding labels in ascending order.
def get_sorted_train_labels(distance_matrix, labels):
    sorted_labels = np.empty(distance_matrix.shape)
    #print(sorted_labels.shape[0], sorted_labels.shape[1])
    j = 0
    
    for row in range(distance_matrix.shape[0]):
        dict = {}
        for i in range(distance_matrix.shape[1]):
            dict[distance_matrix[row][i]] = y_train[i]
            
        dist_list = list(dict.keys())
        dist_list.sort()
        for ele in range(len(dist_list)):
            sorted_labels[j][ele] = int(dict[dist_list[ele]])    
        j = j + 1

        
    return sorted_labels
        
    #raise NotImplementedError

In [151]:
def pairwise_3(x, y):
    """
    An optimize solution with no for loops
  
    """
    dist_mat = np.sum((x[:, None] - y) ** 2, axis = -1) ** (1/2)
    return dist_mat

In [154]:
distance_matrix = pairwise_3(X_test.values, X_train.values)
distance_matrix

array([[1.2489996 , 0.53851648, 1.08627805, ..., 3.84447656, 0.78740079,
        1.81383571],
       [2.54754784, 3.78945906, 4.20356991, ..., 0.55677644, 2.74772633,
        4.84561658],
       [4.1       , 2.42487113, 2.03715488, ..., 6.5169011 , 3.6373067 ,
        1.2489996 ],
       ...,
       [1.78325545, 0.64031242, 0.90553851, ..., 3.94461658, 1.28062485,
        1.40356688],
       [2.89482297, 4.20356991, 4.60868745, ..., 0.46904158, 3.12729915,
        5.26782688],
       [2.6795522 , 4.03360881, 4.46318272, ..., 0.58309519, 2.93598365,
        5.13322511]])

In [None]:
sorted_labels = get_sorted_train_labels(distance_matrix, y_train.values)
sorted_labels

In [None]:
# First arrange the label sequence and then find the mode of k labels.
def get_mode_of_labels(labels, k):
    """
    Compute the mode of k labels
    """
    
    #print(labels)
    final_labels = []
    for row in labels:
        dict = {}
        #print(row)
        for j in row:
            #print(j)
            dict[j] = 0
            
        #print(len(row))
        for i in range(len(row)):
            if (i == (k - 1)):
                #print(i)
                break
            else:
                dict[row[i]] = dict[row[i]] + 1
                #print(dict[row[i]])

        maxm = 0
        label = 0   
        #print(dict)
        for key in dict.keys():
            if(dict[key] > maxm):
                label = key
                maxm = dict[key]
            elif(dict[key] == maxm):
                if(label > key or label == 0):
                    label = key
                
        #print(label)
        final_labels.append(int(label))
        #print(final_labels)

    #print(final_labels)
    return final_labels
    
   # raise NotImplementedError

In [None]:
# Now, using the functions created above, build a KNN.
def predict(x_test, x_train, y_train, k, distance_function):

    final_labels = []
    
    dist_mat = pairwise_3(x_test, x_train)
    get_sorted_labels = get_sorted_train_labels(dist_mat, y_train)
    final_labels = get_mode_of_labels(get_sorted_labels, k)
    return final_labels
   # raise NotImplementedError

In [None]:
prediction = predict(x_test, x_train, y_train, 3, pairwise_3)

## Sklearn

In [141]:

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
y_pred = (neigh.predict(X_test))
print("Accuracy of KNN sklearn: ", np.mean(y_test==y_pred))

Accuracy of KNN sklearn:  1.0


## 8. Naive Bayesian

## Scratch

In [137]:
class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        
        # Initialize mean, variance, and prior
        self.mean = np.zeros(shape = (n_classes, n_features))
        self.variance = np.zeros(shape = (n_classes, n_features))
        self.prior = np.zeros(n_classes)
        
        for idx, cls in enumerate(self.classes):
            X_class = X[y == cls] # Data points which have cls as their class
            self.mean[idx, :] = X_class.mean(axis=0)
            self.variance[idx, :] = X_class.var(axis=0)
            self.prior[idx] = X_class.shape[0] / n_samples

    def _calculate_likelihood(self, class_idx, x):
        # Normal distribution PDF -> 1/sqrt(2*pi*variance) * (e^(-(x_std)**2/2))
        #print(class_idx)
        mean = self.mean[class_idx]
        variance = self.variance[class_idx]
        
        # Avoid division by zero in case of zero variance
        variance = np.where(variance == 0, 1e-10, variance)
        #print(x, mean)
        num = np.exp(-((x - mean) ** 2) / (2 * variance))
        deno = np.sqrt(2 * np.pi * variance)
        return num / deno

    def _calculate_posterior(self, x):
        posteriors = []
        for idx, cls in enumerate(self.classes):
            prior = self.prior[idx]
            likelihood = self._calculate_likelihood(idx, x) # likelihood that this particular datasample is for this particular class
            # P(theta/D) = P(D/theta) * P(theta) / P(D) -> P(D) is constant
            posterior = np.prod(likelihood) * prior
            posteriors.append(posterior)
        return posteriors
    
    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # predicted class is the class for which the posteriors is maximum
            predicted_class = self.classes[np.argmax(posteriors)]
            predictions.append(predicted_class)
        return np.array(predictions)


In [135]:
X_train.values

array([[5.5, 2.4, 3.7, 1. ],
       [6.3, 2.8, 5.1, 1.5],
       [6.4, 3.1, 5.5, 1.8],
       [6.6, 3. , 4.4, 1.4],
       [7.2, 3.6, 6.1, 2.5],
       [5.7, 2.9, 4.2, 1.3],
       [7.6, 3. , 6.6, 2.1],
       [5.6, 3. , 4.5, 1.5],
       [5.1, 3.5, 1.4, 0.2],
       [7.7, 2.8, 6.7, 2. ],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 3.4, 1.4, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 2. , 3.5, 1. ],
       [6.3, 2.7, 4.9, 1.8],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [5.6, 2.7, 4.2, 1.3],
       [5.1, 3.4, 1.5, 0.2],
       [5.7, 3. , 4.2, 1.2],
       [7.7, 3.8, 6.7, 2.2],
       [4.6, 3.2, 1.4, 0.2],
       [6.2, 2.9, 4.3, 1.3],
       [5.7, 2.5, 5. , 2. ],
       [5.5, 4.2, 1.4, 0.2],
       [6. , 3. , 4.8, 1.8],
       [5.8, 2.7, 5.1, 1.9],
       [6. , 2.2, 4. , 1. ],
       [5.4, 3. , 4.5, 1.5],
       [6.2, 3.4, 5.4, 2.3],
       [5.5, 2.3, 4. , 1.3],
       [5.4, 3.9, 1.7, 0.4],
       [5. , 2

In [138]:
# Naive Bayes Model Test
model_naive_bayes = NaiveBayes()
model_naive_bayes.fit(X_train.values, y_train.values)
y_pred_naive_bayes = model_naive_bayes.predict(X_test.values)
print("Naive Bayes Predictions:", y_pred_naive_bayes)
# Output the accuracy and predictions
print("Naive Bayes Test Case Accuracy:", np.mean(y_test.values== y_pred_naive_bayes))


Naive Bayes Predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 2 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
Naive Bayes Test Case Accuracy: 0.9777777777777777


## Sklearn

In [140]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Accuracy by Naive bayes sklearn: ", np.mean(y_test==y_pred))

Accuracy by Naive bayes sklearn:  0.9777777777777777


# Q2)

In [199]:
data = pd.read_csv("/kaggle/input/70k-job-applicants-data-human-resource/stackoverflow_full.csv")
data

Unnamed: 0.1,Unnamed: 0,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills,Employed
0,0,<35,No,Master,1,Man,No,Dev,7,4,Sweden,51552.0,C++;Python;Git;PostgreSQL,4,0
1,1,<35,No,Undergraduate,1,Man,No,Dev,12,5,Spain,46482.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,12,1
2,2,<35,No,Master,1,Man,No,Dev,15,6,Germany,77290.0,C;C++;Java;Perl;Ruby;Git;Ruby on Rails,7,0
3,3,<35,No,Undergraduate,1,Man,No,Dev,9,6,Canada,46135.0,Bash/Shell;HTML/CSS;JavaScript;PHP;Ruby;SQL;Gi...,13,0
4,4,>35,No,PhD,0,Man,No,NotDev,40,30,Singapore,160932.0,C++;Python,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73457,73457,<35,No,Undergraduate,1,Man,No,Dev,7,2,Germany,41058.0,C#;HTML/CSS;JavaScript;TypeScript;Docker;Kuber...,13,1
73458,73458,>35,No,Undergraduate,1,Man,No,Dev,21,16,United States of America,115000.0,C#;HTML/CSS;Java;JavaScript;npm;ASP.NET Core ;...,11,1
73459,73459,<35,No,Undergraduate,1,Man,No,Dev,4,3,Nigeria,57720.0,HTML/CSS;JavaScript;TypeScript;Docker;Express;...,12,1
73460,73460,<35,Yes,Undergraduate,1,Man,Yes,Dev,5,1,United States of America,70000.0,C#;HTML/CSS;JavaScript;SQL;TypeScript;npm;Yarn...,15,1


In [200]:
columns = data.columns

In [201]:
categorical_columns = []

for col in columns:
    print(data[col].dtype)
    if(data[col].dtype == "object" and col != "HaveWorkedWith"):
        categorical_columns.append(col)
        

int64
object
object
object
int64
object
object
object
int64
int64
object
float64
object
int64
int64


In [202]:
# Drop unnecessary columns, such as 'Unnamed: 0' (if not needed)
df = data
df = df.drop(columns=['Unnamed: 0'])

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
#print(df_encoded)

# Handle any other specific transformations (if necessary)
# For example, handling `HaveWorkedWith` as a text field
#df_encoded['HaveWorkedWith'] = df_encoded['HaveWorkedWith'].apply(lambda x: x.split(';'))

# Show the transformed DataFrame
print(df_encoded.head())

   Employment  YearsCode  YearsCodePro  PreviousSalary  \
0           1          7             4         51552.0   
1           1         12             5         46482.0   
2           1         15             6         77290.0   
3           1          9             6         46135.0   
4           0         40            30        160932.0   

                                      HaveWorkedWith  ComputerSkills  \
0                          C++;Python;Git;PostgreSQL               4   
1  Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...              12   
2             C;C++;Java;Perl;Ruby;Git;Ruby on Rails               7   
3  Bash/Shell;HTML/CSS;JavaScript;PHP;Ruby;SQL;Gi...              13   
4                                         C++;Python               2   

   Employed  Age_>35  Accessibility_Yes  EdLevel_NoHigherEd  ...  \
0         0    False              False               False  ...   
1         1    False              False               False  ...   
2         0   

In [203]:
languages_split = df['HaveWorkedWith'].str.split(';', expand=True)
languages_flat = languages_split.melt(value_name='language')['language']

unique_languages = languages_flat.dropna().unique()
print(unique_languages, unique_languages.shape)


['C++' 'Bash/Shell' 'C' 'JavaScript' 'Python' 'Delphi' 'Assembly' 'C#'
 'HTML/CSS' 'Dart' 'Scala' 'Go' 'Kotlin' 'SQL' 'Matlab' 'Groovy' 'Java'
 'Ansible' 'Haskell' 'R' 'Objective-C' 'LISP' 'APL' 'Elixir' 'Swift' 'Git'
 'PHP' 'Clojure' 'PowerShell' 'Crystal' 'Erlang' 'VBA' 'Rust' 'TypeScript'
 'Ruby' 'Node.js' 'Julia' 'Perl' 'F#' 'COBOL' 'Docker' 'Microsoft Azure'
 'Microsoft SQL Server' 'Drupal' 'Laravel' 'ASP.NET' 'AWS' 'MySQL'
 'Angular' 'Oracle' 'Xamarin' 'Kubernetes' 'DigitalOcean' 'Yarn' 'SQLite'
 'jQuery' 'VMware' 'MATLAB' 'Fortran' 'OCaml' 'npm' 'Lua' 'Puppet' 'Flow'
 'Django' 'Unreal Engine' 'Terraform' 'React.js' 'SAS' 'Chef' 'Express'
 'OVH' 'Homebrew' 'ASP.NET Core ' 'Spring' 'Firebase' 'PostgreSQL'
 'FastAPI' 'Heroku' 'Unity 3D' 'Ruby on Rails'
 'Oracle Cloud Infrastructure' 'IBM DB2' 'Vue.js' 'Google Cloud Platform'
 'Redis' 'Elasticsearch' 'Deno' 'Gatsby' 'Angular.js' 'Cassandra'
 'MongoDB' 'MariaDB' 'Symfony' 'Flask' 'Next.js'
 'Firebase Realtime Database' 'Solidity' 'Go

In [205]:
multi_hot_encoded = df_encoded['HaveWorkedWith'].str.get_dummies(sep=";")
df_encoded = pd.concat([df_encoded, multi_hot_encoded], axis = 1)
df_encoded.columns

Index(['Employment', 'YearsCode', 'YearsCodePro', 'PreviousSalary',
       'HaveWorkedWith', 'ComputerSkills', 'Employed', 'Age_>35',
       'Accessibility_Yes', 'EdLevel_NoHigherEd',
       ...
       'TypeScript', 'Unity 3D', 'Unreal Engine', 'VBA', 'VMware', 'Vue.js',
       'Xamarin', 'Yarn', 'jQuery', 'npm'],
      dtype='object', length=420)

In [210]:
df_encoded.drop(columns = ["HaveWorkedWith"],axis=1, inplace = True)

In [211]:
X = df_encoded.drop(columns = ["Employed"])
y = df_encoded["Employed"]

In [212]:
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 80% training data and 20% test data

In [213]:
df_encoded

Unnamed: 0,Employment,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills,Employed,Age_>35,Accessibility_Yes,EdLevel_NoHigherEd,EdLevel_Other,...,TypeScript,Unity 3D,Unreal Engine,VBA,VMware,Vue.js,Xamarin,Yarn,jQuery,npm
0,1,7,4,51552.0,4,0,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1,1,12,5,46482.0,12,1,False,False,False,False,...,1,0,0,0,0,1,0,0,0,0
2,1,15,6,77290.0,7,0,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,1,9,6,46135.0,13,0,False,False,False,False,...,0,0,0,0,0,0,0,0,1,0
4,0,40,30,160932.0,2,0,True,False,False,False,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73457,1,7,2,41058.0,13,1,False,False,False,False,...,1,0,0,0,0,0,0,0,0,1
73458,1,21,16,115000.0,11,1,True,False,False,False,...,0,0,0,0,0,0,0,0,0,1
73459,1,4,3,57720.0,12,1,False,False,False,False,...,1,0,0,0,0,0,0,0,0,0
73460,1,5,1,70000.0,15,1,False,True,False,False,...,1,0,0,0,0,0,0,1,1,1


## Random forest

In [216]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_split = accuracy_score(y_test, y_pred)

# 10-fold cross-validation
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()


## SVM

In [None]:
# Initialize the SVM classifier with One-vs-All strategy
svm_ova = SVC(decision_function_shape='ovr')
svm_ova.fit(X_train, y_train)

# Predict and evaluate the One-vs-All model
y_pred_ova = svm_ova.predict(X_test)
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()
print("cross_val_accuracy: ",cross_val_accuracy)

## Classification Tree

In [None]:

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()
print("cross_val_accuracy: ",cross_val_accuracy)

## Logistic Regression

In [None]:
# Assume X_train, X_test, y_train, y_test are already defined and are numpy arrays

# Train the logistic regression model
model = LogisticRegression(learning_rate=0.01, epochs=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()
print("cross_val_accuracy: ",cross_val_accuracy)


## LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)
y_pred = (clf.predict(X_test))

# Calculate accuracy
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()
print("cross_val_accuracy: ",cross_val_accuracy)

## QDA

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()
print("cross_val_accuracy: ",cross_val_accuracy)

## KNN

In [None]:

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
y_pred = (neigh.predict(X_test))
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()
print("cross_val_accuracy: ",cross_val_accuracy)

## Naive Bayesian

In [None]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)
cross_val_accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy').mean()
print("cross_val_accuracy: ",cross_val_accuracy)

# Q3)

In [156]:
import torch
torch.cuda.is_available()


False

In [157]:
torch.cuda.device_count()


0

In [158]:
torch.cuda.current_device()


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
torch.cuda.device(0)


In [159]:
torch.cuda.get_device_name(0)


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [160]:
import tensorflow as tf
tf.test.is_gpu_available()


False

In [161]:
tf.test.is_built_with_cuda()

True