# Checking with models

In [3]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Sample input array
input_array = ['ads3', 'sdf3r5', 'cew3f', '2344', 'ads3', 'cew3f', 'cew3f']

# Initialize a LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the input array to map values to integers
encoded_array = label_encoder.fit_transform(input_array)

# Display the encoded array
print(encoded_array)


[1 3 2 0 1 2 2]


### With algorithm ( without any models)

In [5]:
input_array = ['ads3', 'sdf3r5', 'cew3f', '2344', 'ads3']

# Initialize an empty dictionary to store the mappings
value_to_int = {}
int_array = []

# Iterate through the input array
for value in input_array:
    # Check if the value is already in the dictionary
    if value in value_to_int:
        # If yes, append the corresponding integer to the result array
        int_array.append(value_to_int[value])
    else:
        # If no, assign a new integer and append it to the result array
        new_int = len(value_to_int) + 1
        value_to_int[value] = new_int
        int_array.append(new_int)

# Display the integer array
print(int_array)


[1, 2, 3, 4, 1]


## As per our need 2nd approach suits well

# Inserting CSV file and performing the changes to change the data to integers

In [79]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('products_dispatched_wagon_wise.csv')

# Specify the column in which you want to perform the encoding
column_name = 'Mat Code'

# Initialize an empty dictionary to store the mappings
value_to_int = {}
int_array = []

# Iterate through the values in the specified column
for value in df[column_name]:
    # Check if the value is already in the dictionary
    if value in value_to_int:
        # If yes, append the corresponding integer to the result list
        int_array.append(value_to_int[value])
    else:
        # If no, assign a new integer and append it to the result list
        new_int = len(value_to_int) + 1
        value_to_int[value] = new_int
        int_array.append(new_int)

# Update the DataFrame column with the integer values
df[column_name] = int_array


In [8]:
df

Unnamed: 0,Mat Code,Wagon No
0,1,SER55070161847
1,1,SER55070161847
2,1,SER55070161847
3,1,NCR94131810560
4,1,NCR94131810560
...,...,...
479,2,NCR94131363349
480,2,NCR94131363349
481,2,NCR94131363349
482,2,NCR94131363349


In [9]:
df.head(50)

Unnamed: 0,Mat Code,Wagon No
0,1,SER55070161847
1,1,SER55070161847
2,1,SER55070161847
3,1,NCR94131810560
4,1,NCR94131810560
5,1,NCR94131810560
6,1,NCR94131810560
7,1,NCR94131810560
8,1,NCR94131810560
9,1,NCR94131810560


# Grouping every 10 continuous data into 1 group set

In [80]:
# Specify the column in which you want to perform the encoding
column_name = 'Mat Code'

# Initialize an empty dictionary to store the mappings
value_to_int = {}
int_array = []

# Initialize a counter to keep track of the current group
group_counter = 0

# Iterate through the values in the specified column
for value in df[column_name]:
    # Check if the value is already in the dictionary
    if value in value_to_int:
        # If yes, append the corresponding integer to the result list
        int_array.append(value_to_int[value])
    else:
        # If no, assign a new integer and append it to the result list
        new_int = len(value_to_int) + 1
        value_to_int[value] = new_int
        int_array.append(new_int)
    
    # Check if we have processed 10 values, and if so, increment the group counter
    if len(int_array) % 10 == 0:
        group_counter += 1

# Update the DataFrame column with the integer values
df[column_name] = int_array

# Create a new column for the group number
df['Group'] = [(i // 10) + 1 for i in range(len(df))]

# Now, df will have a 'Group' column indicating the group number for every 10 values


In [12]:
df

Unnamed: 0,Mat Code,Wagon No,Group
0,1,SER55070161847,1
1,1,SER55070161847,1
2,1,SER55070161847,1
3,1,NCR94131810560,1
4,1,NCR94131810560,1
...,...,...,...
479,2,NCR94131363349,48
480,2,NCR94131363349,49
481,2,NCR94131363349,49
482,2,NCR94131363349,49


In [13]:
df.head(50)

Unnamed: 0,Mat Code,Wagon No,Group
0,1,SER55070161847,1
1,1,SER55070161847,1
2,1,SER55070161847,1
3,1,NCR94131810560,1
4,1,NCR94131810560,1
5,1,NCR94131810560,1
6,1,NCR94131810560,1
7,1,NCR94131810560,1
8,1,NCR94131810560,1
9,1,NCR94131810560,1


# Now let's try to convert the AlphaNumeric values in Wagon No to integers as we did for Mat Code

In [82]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


# Specify the column in which you want to perform the encoding
column_name = 'Mat Code'

# Initialize an empty dictionary to store the mappings
value_to_int = {}
int_array = []

# Initialize a counter to keep track of the current group
group_counter = 0

# Iterate through the values in the specified column
for value in df[column_name]:
    # Check if the value is already in the dictionary
    if value in value_to_int:
        # If yes, append the corresponding integer to the result list
        int_array.append(value_to_int[value])
    else:
        # If no, assign a new integer and append it to the result list
        new_int = len(value_to_int) + 1
        value_to_int[value] = new_int
        int_array.append(new_int)
    
    # Check if we have processed 10 values, and if so, increment the group counter
    if len(int_array) % 10 == 0:
        group_counter += 1

# Update the DataFrame column with the integer values
df[column_name] = int_array

# Create a new column for the group number
df['Group'] = [(i // 10) + 1 for i in range(len(df))]

# Define a function to impute missing values using a machine learning model
def impute_missing_values(group):
    # Split the group into rows with missing and non-missing values
    missing_values = group[group['Mat Code'].isnull()]
    non_missing_values = group[~group['Mat Code'].isnull()]
    
    # Select the features (columns) to use for prediction (excluding the target column)
    features = ['Wagon No']  # Add relevant feature columns
    
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        non_missing_values[features], non_missing_values['Mat Code'], test_size=0.2, random_state=42
    )
    
    # Create a machine learning model (e.g., RandomForestRegressor)
    model = RandomForestRegressor(random_state=42)
    
    # Train the model on non-missing values
    model.fit(X_train, y_train)
    
    # Use the trained model to predict missing values
    predicted_values = model.predict(missing_values[features])
    
    # Update the missing values in the group with the predicted values
    group.loc[group['Mat Code'].isnull(), 'Mat Code'] = predicted_values
    
    return group

# Group the DataFrame by 'Group'
grouped = df.groupby('Group')

# Apply the impute_missing_values function to each group and concatenate the results
imputed_df = grouped.apply(impute_missing_values).reset_index(drop=True)

# Print the first few rows of the imputed DataFrame to the console
print(imputed_df.head())

# Save the imputed DataFrame to a new CSV file
imputed_df.to_csv('imputed_output_file.csv', index=False)


ValueError: could not convert string to float: 'NCR94131810560  '

# since we couldn't convert string to float, because the data is alphanumeric (combination of letters and numbers)

## As the wagon No column is not numeric nor String and It contains AlphaNumeric data. Now let us convert the data to integers like we have done to the other column(Mat Code)

In [83]:
# Specify the columns in which you want to perform the encoding
columns_to_encode = ['Wagon No']

# Initialize empty dictionaries to store the mappings for each column
value_to_int = {column: {} for column in columns_to_encode}
int_arrays = {column: [] for column in columns_to_encode}

# Iterate through the values in the specified columns
for column in columns_to_encode:
    for value in df[column]:
        # Check if the value is already in the dictionary for the specific column
        if value in value_to_int[column]:
            # If yes, append the corresponding integer to the result list
            int_arrays[column].append(value_to_int[column][value])
        else:
            # If no, assign a new integer and append it to the result list
            new_int = len(value_to_int[column]) + 1
            value_to_int[column][value] = new_int
            int_arrays[column].append(new_int)

# Update the DataFrame columns with the integer values
for column in columns_to_encode:
    df[column] = int_arrays[column]


In [19]:
df

Unnamed: 0,Mat Code,Wagon No,Group
0,1,1,1
1,1,1,1
2,1,1,1
3,1,2,1
4,1,2,1
...,...,...,...
479,2,43,48
480,2,43,49
481,2,43,49
482,2,43,49


In [57]:
df.head(50)

Unnamed: 0,Mat Code,Wagon No,Group
0,1,1,1
1,1,1,1
2,1,1,1
3,1,2,1
4,1,2,1
5,1,2,1
6,1,2,1
7,1,2,1
8,1,2,1
9,1,2,1


# Now let's make some data to Null to validate and predict the missing values using machine learning models and to study ML models

In [86]:
import random

# Specify the column in which you want to make random data null
column_name = 'Mat Code'

# Specify the percentage of values to make null (e.g., 10%)
null_percentage = 15  # Change this to your desired percentage

# Calculate the number of values to make null
num_null_values = int(len(df) * null_percentage / 100)

# Get random indices to set as null values
random_indices = random.sample(range(len(df)), num_null_values)

# Set the selected indices in the specified column to NaN (null)
df.loc[random_indices, column_name] = pd.NA




In [87]:
df

Unnamed: 0,Mat Code,Wagon No,Group
0,1.0,1,1
1,,1,1
2,,1,1
3,1.0,2,1
4,,2,1
...,...,...,...
479,2.0,43,48
480,2.0,43,49
481,2.0,43,49
482,2.0,43,49


# df.head(50)

# Now let's Predict the missing Null values 

## let us use RandomForestRegressor for better results

In [92]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Separate rows with missing 'Mat Code' values
missing_values_df = df[df['Mat Code'].isnull()]

# Separate rows without missing 'Mat Code' values
not_missing_values_df = df[~df['Mat Code'].isnull()]

# Define features (X) and target variable (y)
X = not_missing_values_df[['Wagon No', 'Group']]
y = not_missing_values_df['Mat Code']

# Initialize and train a machine learning model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Predict missing 'Mat Code' values
predicted_values = model.predict(missing_values_df[['Wagon No', 'Group']])

# Update the missing 'Mat Code' values in the original DataFrame
df.loc[df['Mat Code'].isnull(), 'Mat Code'] = predicted_values

# Save the updated DataFrame to a new CSV file if needed
df.to_csv('updated_data5.csv', index=False)


In [94]:
df

Unnamed: 0,Mat Code,Wagon No,Group
0,1.0,1,1
1,1.0,1,1
2,1.0,1,1
3,1.0,2,1
4,1.0,2,1
...,...,...,...
479,2.0,43,48
480,2.0,43,49
481,2.0,43,49
482,2.0,43,49


In [95]:
df.head(50)

Unnamed: 0,Mat Code,Wagon No,Group
0,1.0,1,1
1,1.0,1,1
2,1.0,1,1
3,1.0,2,1
4,1.0,2,1
5,1.0,2,1
6,1.0,2,1
7,1.0,2,1
8,1.0,2,1
9,1.0,2,1


# Great We Have predicted the Missing values with almost good accuracy

## Now Let's convert back the integer's to it's normal values