In [None]:
# All important libraries goes here!
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
pd.options.display.float_format = '{:.2f}'.format

In [None]:
dataframe = pd.read_csv('./Data/washedData.csv')
dataframe = dataframe.drop(columns='ID')

clean_data = pd.read_csv('./Data/cleaned-data.csv')


#### Problem Statement
We are trying to understand the factors that influence whether a company has affected employees.

This is important because companies with affected employees may require additional support or interventions.

By identifying the key factors, we can target our interventions more effectively and potentially prevent employees from being affected in the future.

To solve this problem, we will use this dataset to build a predictive model.

This model will take as input the various financial and operational characteristics of a company and output a prediction of whether the company has affected employees.

We can then use this model to predict the status of new companies and guide our interventions.


<br>
<br>

#### (a) Data cleaning


##### (a) (ii) looking if there are missing values in each column of the dataframe

In [None]:
dataframe.info()

From the results it shows that there arent any missing values, since the total entries = Not-Null Count of 4137

<br>

##### (a) (iii) Checking for duplicate rows

In [None]:
dataframe.duplicated().sum()

There are 1075 duplicates in the dataset, Those will be removed

In [None]:
dataframe.drop_duplicates(inplace=True)
dataframe.duplicated().sum()

<br>

##### (a) (iv) Ensuring data consistency and removing null values

These attributes may contain some inconsistencies, for the datatype has to be an integer.

 2.   Annual turnover    4129 non-null   object
 3.   TCTC               4123 non-null   object
 4.  Basic Salary       4135 non-null   object

In [None]:
dataframe['Annual turnover'] = pd.to_numeric(dataframe['Annual turnover'], errors='coerce')
dataframe['TCTC'] = pd.to_numeric(dataframe['TCTC'], errors='coerce')
dataframe['Basic Salary'] = pd.to_numeric(dataframe['Basic Salary'], errors='coerce')

dataframe.dropna(inplace=True)

dataframe.head(5)

<br>

##### (a) (v) Removing outliers

In [None]:
def remove_outliers(dataframe, column):
    Q1 = dataframe[column].quantile(0.25)
    Q3 = dataframe[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    dataframe = dataframe[(dataframe[column] >= lower_bound) & (dataframe[column] <= upper_bound)]
    return dataframe

for column in ['No of employee', 'Annual turnover', 'TCTC', 'Basic Salary']:
    dataframe = remove_outliers(dataframe, column)

dataframe.to_csv('./Data/cleaned-data.csv', index=False)

<br>
<br>

#### (b) Statistical Analysis

In [None]:
dataframe.describe()

**Number of Employees:**
- The data covers a total of 1,584 businesses.
- On average, each business has approximately 5.47 employees.
- The number of employees varies significantly, ranging from 1 employee to a maximum of 26 employees in a single business.

**Annual Turnover (Revenue):**
- The average annual turnover (revenue) for businesses is approximately 525,426.92 Namibian Dollars.
- The range of annual turnovers is substantial, with some businesses reporting negative turnover (indicating losses) and others earning up to 6,100,000 Namibian Dollars annually.

**Total Compensation for Employees:**
- On average, each employee receives about 19,281.95 Namibian Dollars as total compensation annually.
- The total compensation varies widely among employees, with some receiving no compensation and the highest-earning employee receiving 153,192 Namibian Dollars annually.

**Basic Salary:**
- The average basic salary for employees is around 14,758.83 Namibian Dollars.
- The range of basic salaries is extensive, with some employees not receiving any basic salary and others receiving up to 75,800 Namibian Dollars.

**Cash Injection:**
- On average, businesses received cash injections or financial support about 71% of the time. This suggests that many businesses received external financial assistance.

**Contribution Waiver:**
- Businesses applied contribution waivers approximately 70% of the time. This indicates that a significant portion of businesses waived certain contributions.

**Affected Employees:**
- On average, about 91% of businesses reported that their employees were affected by certain conditions or changes in the business.


<br>
<br>

#### (c) Exploratory Data Analysis (EDA)


##### (c) (i) Employment distribution

In [None]:
data = clean_data['No of employee']

sns.histplot(data, kde=True, bins=10)
plt.title("No of employee Distribution")
plt.xlabel("No of employee")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

Most companies have fewer than 10 employees, with a significant number having just 1 employee.

The distribution is right-skewed, meaning there are a few companies with a large number of employees.
<br>


##### (c) (iii) Anual Turnover distribution

In [None]:
data = clean_data['Annual turnover']

sns.histplot(data, kde=True, bins=10)
plt.title("Annual Turnover Distribution")
plt.xlabel("Annual Turnover")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

Most companies have an annual turnover of less than 1,000,000.

The distribution is right-skewed, indicating that there are a few companies with a very high annual turnover.
<br>


##### (c) (iv) Total Cost to Company (TCTC) distribution

In [None]:
data = clean_data['TCTC']

sns.histplot(data, kde=True, bins=10)
plt.title("Total Cost To Company (TCTC)")
plt.xlabel("TCTC")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

The Total Cost to Company (TCTC) for most companies is less than 20,000.

The distribution is right-skewed, indicating that there are a few companies with a very high TCTC.

<br>


##### (c) (v) Basic Salary distribution

In [None]:
data = clean_data['Basic Salary']

sns.histplot(data, kde=True, bins=10)
plt.title("Basic Salary Distribution")
plt.xlabel("Basic Salary")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

Most companies have a basic salary of less than 20,000.

The distribution is right-skewed, indicating that there are a few companies with a very high basic salary.

<br>


##### (c) (vi) Cash Injection distribution

In [None]:
data = clean_data['Cash Injection']

sns.histplot(data, kde=True, bins=2)
plt.title("Cash Injection Distribution")
plt.xlabel("Cash Injectionr")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

This is a binary variable where 1 means a cash injection has happened, and 0 means it hasn't.

This is a binary variable where 1 indicates that companies have cash injection, while "0" indicates that they do not have a cash injection.

<br>


##### (c) (vii) Contribution Waiver distribution

In [None]:
data = clean_data['Contrib Waiver']

sns.histplot(data, kde=True, bins=2)
plt.title("Contribution Waiver Distribution")
plt.xlabel("Contribution Waiver")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

This is a binary variable where 1 indicates that companies have a contribution waiver, while "0" indicates that they do not have a contribution waiver.

This is a binary variable where 1 indicates that companies have a contribution waiver, while "0" indicates that they do not have a contribution waiver.

<br>


##### (c) (viii) Affected Employee distribution

In [None]:
data = clean_data['Affected Employee']

sns.histplot(data, kde=True, bins=2)
plt.title("Contribution Waiver Distribution")
plt.xlabel("Contribution Waiver")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

This is another binary variable, with 1 indicating that an employee has been affected and 0 indicating that they have not.

The histogram shows that most companies have had an affected employee (value of 1).

<br>


##### (c) (ix) Measuring correlations between the variables

In [None]:
corr_matrix = clean_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation matrix')


**Employee-Turnover Correlation**
- There is a positive correlation between **No of employee** and **Annual turnover**.
- This suggests that companies with more employees tend to have higher annual turnover.

**Employee-TCTC and Basic Salary Correlation**
- There is also a positive correlation between **No of employee** and **TCTC**, **Basic Salary**.
- This suggests that companies with more employees tend to have higher **TCTC** and **Basic Salary**.

**Binary Variables Correlation**
- **Cash Injection**, **Contrib Waiver**, and **Affected Employee** are binary variables and do not show any clear correlation with other variables.

**TCTC-Basic Salary Correlation** 
- There is a positive correlation between **TCTC** and **Basic Salary**.
- This suggests that companies with higher TCTC tend to offer higher Basic Salary.

 Turnover-TCTC and Basic Salary Correlation: 
- There is no clear correlation between **Annual turnover** and **TCTC**, **Basic Salary**.
- This suggests that the annual turnover of a company does not necessarily depend on the **TCTC** or **Basic Salary**.

**Cash Injection, Contrib Waiver, and Affected Employee Correlation**
- There is no clear correlation between **Cash Injection**, **Contrib Waiver**, and **Affected Employee**.
- This suggests that these variables do not have a significant influence on each other.

<br>
<br>

#### Machine Learning

##### Predictive Modeling for Annual Turnover

Using **Annual Turnover** as the target variable (what you want to predict) and use other features as predictors.

* No of Employees
* TCTC
* Basic Salary
* Cash Injection
* Contrib Waiver

we can use regression algorithms like linear regression, decision trees, or random forests to build the model.

<br>

##### Employee Classification

We can use machine learning to classify employees into different categories based

* Cash Injection
* Contrib Waiver
* Affected Employee

We might want to classify employees into **Highly Affected** and **Less Affected** categories.
We can use classification algorithms like logistic regression, decision trees, or support vector machines.

<br>

##### Employee Segmentation

Clustering techniques like K-means clustering can be used to segment employees based on their characteristics.

We can use features like:

* No of Employees
* TCTC
* Basic Salary

To create meaningful clusters

In [None]:
# Code goes here!

<br>
<br>

#### Evaluation of Machine Learning

Present performance metrics (e.g., Mean Absolute Error, R-squared) for each algorithm used.

Explain what the results mean:

* Which algorithm performed better?
* How accurate is the prediction of turnover?

<br>
<br>

#### Presentation of Results

Summarize key findings:

* Trends in employee turnover.
* Compensation fairness insights.

Mention any actionable recommendations based on the analysis.

In [94]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = clean_data

# Define features and create a binary target variable
X = data[['No of employee', 'TCTC', 'Basic Salary']]
# Set a threshold value for annual turnover to define "doing well" (1) or "not doing well" (0)
threshold = 1000000  # Adjust the threshold as needed
y = (data['Annual turnover'] > threshold).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


Accuracy: 0.832807570977918
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90       528
           1       0.50      0.24      0.32       106

    accuracy                           0.83       634
   macro avg       0.68      0.59      0.61       634
weighted avg       0.80      0.83      0.81       634



In [96]:
# Assuming you have already trained and loaded your RandomForestClassifier as 'model'

# Define the new data
new_data = pd.DataFrame({
    'No of employee': [2],
    'TCTC': [200000000],
    'Basic Salary': [2000]
})

# Standardize the new data using the same scaler used for training
new_data = scaler.transform(new_data)

# Make predictions for the new data point
prediction = model.predict(new_data)

# Interpret the prediction
if prediction[0] == 1:
    print("The company is predicted to be doing well.")
else:
    print("The company is predicted to not be doing well.")
    


The company is predicted to not be doing well.


In [59]:
clean_data

Unnamed: 0,No of employee,Annual turnover,TCTC,Basic Salary,Cash Injection,Contrib Waiver,Affected Employee
0,1,1500000.00,3000.00,3000.00,1,1,1
1,9,0.00,0.00,0.00,0,0,1
2,16,0.00,1500.00,1500.00,1,0,1
3,2,36000.00,3600.00,3600.00,0,0,1
4,16,798783.03,9848.22,9880.00,1,1,1
...,...,...,...,...,...,...,...
1579,2,13400.00,7518.00,2000.00,1,0,1
1580,3,12000.00,1500.00,300.00,1,0,1
1581,1,508580.00,35350.57,27856.33,1,1,1
1582,2,550992.00,45916.00,45916.00,1,1,1
