In [80]:
import pandas as pd

In [81]:
df = pd.read_csv('AllGreyhounds.csv')

In [82]:
df.isna().sum()

Greyhound Name           11841
Earmark                      0
Reg Date                     0
Active?                      0
Colour                      17
Sex                          0
Whelp Date                  26
Retirement Type          26905
Retirement Created At    26905
Ret Description          28859
Status Created At         8668
Status Meaning            8668
ICC VolReg               18185
Microchip                 8555
dtype: int64

In [83]:
cols = ['Greyhound Name', 'ICC VolReg', 'Microchip']
df = df.drop(cols, axis=1)

In [84]:
df.rename(columns = {'Active?':'Active'}, inplace = True)

In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier


In [86]:
# Convert categorical columns to numerical representation using LabelEncoder
label_encoder = LabelEncoder()
df["Colour"] = label_encoder.fit_transform(df["Colour"])
df["Sex"] = label_encoder.fit_transform(df["Sex"])

# Split the data into training and testing sets
train_df = df.dropna(subset=["Retirement Type"]).copy()
test_df = df[df["Retirement Type"].isnull()].copy()

# Define the features and target
features = ["Active", "Colour", "Sex"]
target = "Retirement Type"

# Create and fit the KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_df[features], train_df[target])

# Predict the missing values for "Retirement Type" in test_df
test_df[target] = knn.predict(test_df[features])

In [87]:
# Update the original DataFrame with the predicted values
df.update(test_df)

In [88]:
df.isna().sum()

Earmark                      0
Reg Date                     0
Active                       0
Colour                       0
Sex                          0
Whelp Date                  26
Retirement Type              0
Retirement Created At    26905
Ret Description          28859
Status Created At         8668
Status Meaning            8668
dtype: int64

In [89]:
df

Unnamed: 0,Earmark,Reg Date,Active,Colour,Sex,Whelp Date,Retirement Type,Retirement Created At,Ret Description,Status Created At,Status Meaning
0,QDBAJ,2015-09-29,True,0.0,0.0,2015-04-05,NEW,,,2021-01-09 08:47:41.636,Licence Unsuspended
1,QPRPR,2016-01-25,True,9.0,1.0,2015-03-17,NEW,,,2021-07-29 05:06:15.017,Transfer is complete
2,QDXXI,2016-01-28,False,9.0,1.0,2015-01-02,NEW,2021-05-22 05:01:43.474,To be re-homed,2021-05-24 05:00:29.518,Transfer is complete
3,QLZRE,2016-03-22,False,3.0,1.0,2014-12-18,RET,2021-10-08 08:47:10.088,As a pet,2021-01-09 11:18:10.541,Licence Unsuspended
4,QSQMK,2016-04-12,True,2.0,0.0,2015-03-26,RET,,,2021-01-09 20:32:22.361,Identified for Racing
...,...,...,...,...,...,...,...,...,...,...,...
46157,ZYHJP,2023-05-02,True,9.0,1.0,2023-05-02,NEW,,,,
46158,ZYTEB,2023-05-02,True,11.0,1.0,2023-05-02,NEW,,,,
46159,ZYKRR,2023-05-03,True,3.0,0.0,2023-05-03,RET,,,,
46160,ZYHKP,2023-05-03,True,3.0,0.0,2023-05-03,RET,,,,


In [90]:
df.to_csv('AllGreyhounds12.csv', sep='\t', encoding='utf-8')

In [91]:
df

Unnamed: 0,Earmark,Reg Date,Active,Colour,Sex,Whelp Date,Retirement Type,Retirement Created At,Ret Description,Status Created At,Status Meaning
0,QDBAJ,2015-09-29,True,0.0,0.0,2015-04-05,NEW,,,2021-01-09 08:47:41.636,Licence Unsuspended
1,QPRPR,2016-01-25,True,9.0,1.0,2015-03-17,NEW,,,2021-07-29 05:06:15.017,Transfer is complete
2,QDXXI,2016-01-28,False,9.0,1.0,2015-01-02,NEW,2021-05-22 05:01:43.474,To be re-homed,2021-05-24 05:00:29.518,Transfer is complete
3,QLZRE,2016-03-22,False,3.0,1.0,2014-12-18,RET,2021-10-08 08:47:10.088,As a pet,2021-01-09 11:18:10.541,Licence Unsuspended
4,QSQMK,2016-04-12,True,2.0,0.0,2015-03-26,RET,,,2021-01-09 20:32:22.361,Identified for Racing
...,...,...,...,...,...,...,...,...,...,...,...
46157,ZYHJP,2023-05-02,True,9.0,1.0,2023-05-02,NEW,,,,
46158,ZYTEB,2023-05-02,True,11.0,1.0,2023-05-02,NEW,,,,
46159,ZYKRR,2023-05-03,True,3.0,0.0,2023-05-03,RET,,,,
46160,ZYHKP,2023-05-03,True,3.0,0.0,2023-05-03,RET,,,,


In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

# Replace None values in "Ret Description" with an empty string
df["Ret Description"].fillna('', inplace=True)

# Check if there are any missing values in "Ret Description"
if df["Ret Description"].isnull().any():
    # Vectorize "Ret Description" using TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df["Ret Description"])

    # Split the data into training and testing sets
    train_matrix = tfidf_matrix[df["Ret Description"].notnull()]
    test_matrix = tfidf_matrix[df["Ret Description"].isnull()]

    if len(train_matrix) > 0:
        # Find the k-nearest neighbors
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(train_matrix, df["Ret Description"].dropna())

        # Predict the missing values for "Ret Description" in test_df
        imputed_values = knn.predict(test_matrix)

        # Update the original DataFrame with the imputed values
        df.loc[df["Ret Description"].isnull(), "Ret Description"] = imputed_values

In [93]:
df

Unnamed: 0,Earmark,Reg Date,Active,Colour,Sex,Whelp Date,Retirement Type,Retirement Created At,Ret Description,Status Created At,Status Meaning
0,QDBAJ,2015-09-29,True,0.0,0.0,2015-04-05,NEW,,,2021-01-09 08:47:41.636,Licence Unsuspended
1,QPRPR,2016-01-25,True,9.0,1.0,2015-03-17,NEW,,,2021-07-29 05:06:15.017,Transfer is complete
2,QDXXI,2016-01-28,False,9.0,1.0,2015-01-02,NEW,2021-05-22 05:01:43.474,To be re-homed,2021-05-24 05:00:29.518,Transfer is complete
3,QLZRE,2016-03-22,False,3.0,1.0,2014-12-18,RET,2021-10-08 08:47:10.088,As a pet,2021-01-09 11:18:10.541,Licence Unsuspended
4,QSQMK,2016-04-12,True,2.0,0.0,2015-03-26,RET,,,2021-01-09 20:32:22.361,Identified for Racing
...,...,...,...,...,...,...,...,...,...,...,...
46157,ZYHJP,2023-05-02,True,9.0,1.0,2023-05-02,NEW,,,,
46158,ZYTEB,2023-05-02,True,11.0,1.0,2023-05-02,NEW,,,,
46159,ZYKRR,2023-05-03,True,3.0,0.0,2023-05-03,RET,,,,
46160,ZYHKP,2023-05-03,True,3.0,0.0,2023-05-03,RET,,,,


In [94]:
df.isna().sum()

Earmark                      0
Reg Date                     0
Active                       0
Colour                       0
Sex                          0
Whelp Date                  26
Retirement Type              0
Retirement Created At    26905
Ret Description              0
Status Created At         8668
Status Meaning            8668
dtype: int64

In [95]:
df.to_csv('AllGreyhounds2.csv', sep='\t', encoding='utf-8')