In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [2]:
# Load your dataset
file_path = 'corporate.csv' 
data = pd.read_csv(file_path)
print(data)

   Company ID Corporate Income Tax Rate   Net Sales Operating Expenses  \
0     ABC-001                       20%  $5,000,000         $2,000,000   
1     DEF-002                       18%  $7,000,000         $2,500,000   
2     GHI-003                       22%  $8,000,000         $3,000,000   
3     JKL-004                       17%  $6,000,000         $2,200,000   
4     MNO-005                       21%  $9,000,000         $3,500,000   
5     MNO-006                       14%  $8,000,000         $2,500,000   
6     MNO-007                       23%  $5,000,000         $4,500,000   
7     MNO-008                       21%  $6,000,000         $2,500,000   
8     MNO-009                       16%  $5,000,000         $1,500,000   
9     MNO-010                       15%  $3,000,000         $1,500,000   
10    MNO-011                       21%  $4,000,000         $3,000,000   
11    MNO-012                       20%  $9,000,000         $3,500,000   
12    MNO-013                       19

In [3]:
# Convert columns with percentage signs to numeric after removing the percentage sign
percentage_columns = ['Corporate Income Tax Rate', 'Effective Tax Rate', 'Gross Profit Margin', 'Return on Assets (ROA)']
for col in percentage_columns:
    data[col] = data[col].str.replace('%', '').astype(float)

In [4]:
# Convert columns with currency signs to numeric after removing the currency sign and commas
currency_columns = ['Net Sales', 'Operating Expenses', 'Other Income', 'Operating Cash Flow', 'Current Assets', 'Current Liabilities']
for col in currency_columns:
    data[col] = data[col].replace({'\$': '', ',': ''}, regex=True).astype(float)

In [5]:
# Remove non-numeric columns or columns with non-essential information
data = data.drop(['Company ID', 'Business Location', 'Industry Type'], axis=1)
label_encoder = LabelEncoder()
data['Changes in Key Personnel'] = label_encoder.fit_transform(data['Changes in Key Personnel'])

In [6]:
print(data)
print(data.isnull().sum())


    Corporate Income Tax Rate  Net Sales  Operating Expenses  Other Income  \
0                        20.0  5000000.0           2000000.0      100000.0   
1                        18.0  7000000.0           2500000.0      150000.0   
2                        22.0  8000000.0           3000000.0      200000.0   
3                        17.0  6000000.0           2200000.0      120000.0   
4                        21.0  9000000.0           3500000.0      250000.0   
5                        14.0  8000000.0           2500000.0      350000.0   
6                        23.0  5000000.0           4500000.0      250000.0   
7                        21.0  6000000.0           2500000.0      150000.0   
8                        16.0  5000000.0           1500000.0      180000.0   
9                        15.0  3000000.0           1500000.0      140000.0   
10                       21.0  4000000.0           3000000.0      150000.0   
11                       20.0  9000000.0           3500000.0    

In [7]:
datas = data.fillna(data.mean())
print(datas)

    Corporate Income Tax Rate  Net Sales  Operating Expenses  Other Income  \
0                        20.0  5000000.0           2000000.0      100000.0   
1                        18.0  7000000.0           2500000.0      150000.0   
2                        22.0  8000000.0           3000000.0      200000.0   
3                        17.0  6000000.0           2200000.0      120000.0   
4                        21.0  9000000.0           3500000.0      250000.0   
5                        14.0  8000000.0           2500000.0      350000.0   
6                        23.0  5000000.0           4500000.0      250000.0   
7                        21.0  6000000.0           2500000.0      150000.0   
8                        16.0  5000000.0           1500000.0      180000.0   
9                        15.0  3000000.0           1500000.0      140000.0   
10                       21.0  4000000.0           3000000.0      150000.0   
11                       20.0  9000000.0           3500000.0    

In [8]:
# Separate features and target variable
X = datas.drop('Changes in Key Personnel', axis=1)
y = datas['Changes in Key Personnel']

In [9]:
# Feature scaling for better model performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [10]:
print(X_train)
print(y_train)
print(X_test)
print(y_test)

[[ 0.61075185  1.1249969   0.4426827   0.80946058 -1.68634964  1.1916484
   0.761641    0.18043874 -0.63233082  0.72531852  0.87082768 -0.45062974]
 [ 1.20371482  1.59768467  1.03135651  0.25805691  0.75932318 -0.82581503
   1.02794905  2.24259579  0.98903026  0.72531852  1.47139849  3.06991509]
 [ 0.61075185 -1.71112973  1.03135651  0.80946058  0.06055952  0.65365815
  -0.30359117 -0.33510052 -0.02432042  0.72531852 -1.23117017 -0.45062974]
 [-0.27869259 -0.29306642 -1.32333871 -0.51390822 -0.75466475 -0.15332722
  -0.96936128 -0.85063978  1.59704067 -0.92313266  1.47139849  1.19229118]
 [ 0.01778889  0.17962135 -0.49919539 -1.17559261  1.45808684 -0.42232234
  -0.56989921  0.18043874 -0.83500096  0.06593805 -0.33031395  0.25347923]
 [ 1.79667778 -0.29306642  2.20870412  0.25805691  0.40994135 -1.22930772
  -0.30359117 -0.33510052 -1.0376711  -0.26375219 -0.33031395 -0.45062974]
 [ 1.20371482  0.65230913  0.4426827   0.25805691 -2.26865269 -1.49830284
   1.02794905 -0.59287015 -0.4296

In [11]:
print(datas.isnull().sum())


Corporate Income Tax Rate    0
Net Sales                    0
Operating Expenses           0
Other Income                 0
Gross Profit Margin          0
Operating Cash Flow          0
Current Assets               0
Current Liabilities          0
Effective Tax Rate           0
Inventory Turnover           0
Return on Assets (ROA)       0
Number of Employees          0
Changes in Key Personnel     0
dtype: int64


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(random_state=32)

# Fit the model on the training data
clf.fit(X_train,y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



In [13]:
# Generate and print the classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Classification Report:
              precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       1.00      0.60      0.75         5

    accuracy                           0.80        10
   macro avg       0.86      0.80      0.79        10
weighted avg       0.86      0.80      0.79        10


Confusion Matrix:
[[5 0]
 [2 3]]


In [None]:
# Initializing Random Forest and Gradient Boosting Classifiers
rf_clf = RandomForestClassifier(random_state=32)
gb_clf = GradientBoostingClassifier(random_state=32)

# Fitting the models
rf_clf.fit(X_train, y_train)
gb_clf.fit(X_train, y_train)




In [15]:
rf_y_pred = rf_clf.predict(X_test)
gb_y_pred = gb_clf.predict(X_test)


In [16]:
# Evaluating Gradient Boosting Classifier
gb_accuracy = accuracy_score(y_test, gb_y_pred)
print(f"\nGradient Boosting Classifier Accuracy: {gb_accuracy:.2f}")
print("\nGradient Boosting Classifier Report:")
print(classification_report(y_test, gb_y_pred))
print("\nGradient Boosting Classifier Confusion Matrix:")
print(confusion_matrix(y_test, gb_y_pred))


Gradient Boosting Classifier Accuracy: 0.70

Gradient Boosting Classifier Report:
              precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       1.00      0.40      0.57         5

    accuracy                           0.70        10
   macro avg       0.81      0.70      0.67        10
weighted avg       0.81      0.70      0.67        10


Gradient Boosting Classifier Confusion Matrix:
[[5 0]
 [3 2]]


In [None]:

# Get input from the user for prediction
user_input = {}

print("Enter the required details for prediction:")
for col in X.columns:
    user_input[col] = input(f"Enter value for '{col}': ")

# Create a DataFrame from the user input
user_df = pd.DataFrame([user_input])

# Convert user input to numeric format
for col in user_df.columns:
    if user_df[col].dtype == object:  # Check if the column type is object (string)
        if user_df[col].str.contains('%').any():
            user_df[col] = user_df[col].str.replace('%', '').astype(float)
        elif user_df[col].str.contains('$').any():
            user_df[col] = user_df[col].replace({'\$': '', ',': ''}, regex=True).astype(float)
        else:
            user_df[col] = user_df[col].astype(float)

# Scale the user input data
user_scaled = scaler.transform(user_df)

# Predict with the Random Forest Classifier
user_prediction_rf = clf.predict(user_scaled)

# Predict with the Gradient Boosting Classifier
user_prediction_gb = gb_clf.predict(user_scaled)

# Display the user input and predictions
print("\nUser Input:")
print(user_df)

print("\nPrediction Results:")
print(f"Random Forest Classifier Prediction: {'Yes' if user_prediction_rf[0] == 1 else 'No'}")
print(f"Gradient Boosting Classifier Prediction: {'Yes' if user_prediction_gb[0] == 1 else 'No'}")


Enter the required details for prediction:
