In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn.metrics import jaccard_score

In [2]:
# Load the CSV data
data = pd.read_csv('sampletest1.csv')

In [3]:
# Separate data into rows with non-null heat numbers and rows with missing heat numbers
data_with_heat = data.dropna(subset=['Heat No'])
data_missing_heat = data[data['Heat No'].isnull()]

In [4]:
# Split the data with non-null heat numbers into features (wagon numbers) and labels (heat numbers)
X_train = data_with_heat['Wagon No']
y_train = data_with_heat['Heat No']

In [5]:
# Create a text processing pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB())
])

In [6]:
# Train the model
pipeline.fit(X_train, y_train)

In [7]:
# Predict heat numbers for rows with missing heat numbers
predicted_heat_numbers = pipeline.predict(data_missing_heat['Wagon No'])

In [8]:
# Print the predicted values along with their original SNO, wagon numbers, and predicted heat numbers
for sno, wagon_number, predicted_heat in zip(data_missing_heat['SNO'], data_missing_heat['Wagon No'], predicted_heat_numbers):
    print(f"SNO: {sno} - Wagon Number: {wagon_number} - Predicted Heat Number: {predicted_heat}")

SNO: 5 - Wagon Number: NCR94131810560   - Predicted Heat Number: 23F01059
SNO: 13 - Wagon Number: NCR94131810560   - Predicted Heat Number: 23F01059
SNO: 20 - Wagon Number: ECOR56120642279  - Predicted Heat Number: 23E00914
SNO: 32 - Wagon Number: SCR57091810261   - Predicted Heat Number: 2303183
SNO: 37 - Wagon Number: SER94072310525   - Predicted Heat Number: 2303295
SNO: 53 - Wagon Number: ECOR94122111607  - Predicted Heat Number: 23E00910
SNO: 109 - Wagon Number: SER55079960861   - Predicted Heat Number: 2303292
SNO: 185 - Wagon Number: SER94071359136   - Predicted Heat Number: 23E00996
SNO: 262 - Wagon Number: SER94071359136   - Predicted Heat Number: 23E00996
SNO: 280 - Wagon Number: SER94072310327   - Predicted Heat Number: 2303183
SNO: 308 - Wagon Number: SER94072310495   - Predicted Heat Number: 23F01023
SNO: 362 - Wagon Number: ER94021329387    - Predicted Heat Number: 23F01075
SNO: 367 - Wagon Number: ER94021329387    - Predicted Heat Number: 23F01075
SNO: 377 - Wagon Number

In [12]:
# Calculate the Jaccard similarity score for each pair of actual and predicted sets
jaccard_scores = []
for actual_heat, predicted_heat in zip(actual_heat_numbers, predicted_heat_numbers):
    actual_heat_set = set(actual_heat.split(','))
    predicted_heat_set = set(predicted_heat.split(','))
    jaccard_scores.append(jaccard_score(actual_heat_set, predicted_heat_set, average='macro'))


AttributeError: 'float' object has no attribute 'split'

In [11]:
# Calculate the Jaccard similarity score for each pair of actual and predicted sets
jaccard_scores = []
for actual_heat, predicted_heat in zip(actual_heat_numbers, predicted_heat_numbers):
    actual_heat_set = set(actual_heat.split(','))
    predicted_heat_set = set(predicted_heat.split(','))
    jaccard_scores.append(jaccard_score(actual_heat_set, predicted_heat_set, average='macro'))


AttributeError: 'float' object has no attribute 'split'