In [65]:
# installs google-api-python-client, google-auth, google-auth-oauthlib
# !pip install --upgrade google-api-python-client google-auth google-auth-oauthlib google-auth-httplib2

In [66]:
# updates pip
# pip install --upgrade pip
# # restart kernel once updates are done.

In [67]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [37]:
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from io import BytesIO
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from xgboost import XGBClassifier
import pickle as pkl
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer  # Importing ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report

In [38]:
def load_data(filename='input.txt'):
    """Read ``key=value`` settings from a text file into a dict.

    Each non-blank line is split on its FIRST ``=`` only, so values may
    themselves contain ``=`` (URLs, base64 ids, ...). Whitespace around the
    key and the value is removed. Blank lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the settings file. Defaults to ``'input.txt'``.

    Returns
    -------
    dict
        Mapping of key (str) to value (str). If a key occurs more than once
        the last occurrence wins.

    Raises
    ------
    ValueError
        If a non-blank line contains no ``=`` separator.
    """
    config = {}
    with open(filename, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate empty lines, including a trailing newline at end of file
                continue
            key, separator, value = line.partition('=')
            if not separator:
                raise ValueError(
                    f"{filename}:{line_number}: expected 'key=value', got {line!r}"
                )
            config[key.strip()] = value.strip()
    return config

In [39]:
# Parse the default settings file (input.txt) into a dict of string settings.
input_details = load_data()

In [40]:
# Google Drive file identifier taken from the parsed settings; echoed so the
# cell output records which file this run used.
file_id = input_details['file_id']
print(f' file id is {file_id}')

 file id is 1ZY9Qv5nmDJ0yzffr5qCdHPrWMfbiBf5t


In [41]:
def access_data_from_drive(file_id, credentials_path='./credentials.json'):
    """Download a CSV file from Google Drive and load it into a DataFrame.

    Runs the installed-app OAuth flow on every call (a local server is started
    and the user is asked to authorise access), prints the file's metadata,
    downloads the content into an in-memory buffer and parses it as CSV.

    Parameters
    ----------
    file_id : str
        Drive identifier of the file to download.
    credentials_path : str, optional
        Path of the OAuth client-secrets JSON file. Defaults to
        ``'./credentials.json'``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    # Read-only Drive access is all that is needed to fetch the file
    scopes = ['https://www.googleapis.com/auth/drive.readonly']

    # Authenticate using OAuth2 credentials; port=0 lets the OS pick a free port
    flow = InstalledAppFlow.from_client_secrets_file(credentials_path, scopes)
    creds = flow.run_local_server(port=0)

    # Build the Google Drive service
    service = build('drive', 'v3', credentials=creds)

    # Request file metadata
    file_metadata = service.files().get(fileId=file_id).execute()
    print("File Metadata", file_metadata)

    # In-memory buffer that receives the downloaded bytes
    file_content = BytesIO()

    # Request the media content from Google Drive
    request = service.files().get_media(fileId=file_id)
    media_downloader = MediaIoBaseDownload(file_content, request)

    # Pull chunks until the downloader reports completion
    done = False
    while not done:
        status, done = media_downloader.next_chunk()
        print(f"Download {int(status.progress()*100)}% complete.")

    # Rewind so read_csv starts at the first byte
    file_content.seek(0)

    # Parse the downloaded bytes as CSV
    df = pd.read_csv(file_content)
    print(df)
    return df

In [42]:
# Runs the OAuth authorisation flow and downloads the Drive file into a DataFrame.
df = access_data_from_drive(file_id)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=970406091830-342nl3nqlck6q0mscvg8vj0fnm3r36ot.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A57909%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.readonly&state=ewlytpMXllirhTyojEqhdMxzbAkWsr&access_type=offline
File Metadata {'kind': 'drive#file', 'id': '1ZY9Qv5nmDJ0yzffr5qCdHPrWMfbiBf5t', 'name': 'Fertilizer_Prediction.csv', 'mimeType': 'text/csv'}
Download 100% complete.
    Temparature  Humidity   Moisture Soil Type  Crop Type  Nitrogen  \
0            26         52        38     Sandy      Maize        37   
1            29         52        45     Loamy  Sugarcane        12   
2            34         65        62     Black     Cotton         7   
3            32         62        34       Red    Tobacco        22   
4            28         54        46    Clayey      Paddy        35   
..          ...        ...       ...       ..

In [43]:
def data_overview(df):
    """Print a quick overview of ``df`` and return it with normalised column names.

    Column names are stripped of surrounding whitespace, lower-cased and have
    spaces replaced by underscores. The renaming is done on a copy, so the
    frame passed in by the caller keeps its original headers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with normalised column names.
    """
    # Column names as they arrived
    print('List of columns in teh given data are: \n', df.columns)

    # Normalise names on a copy so the caller's frame is not modified
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower().str.replace(' ','_')
    print('\n List of columns in teh given data are: \n', df.columns)

    # details of the data
    print('\n Shape of the data is: \n',df.shape)
    # DataFrame.info() writes its report itself and returns None, so it must not
    # be passed to print(): print the heading first, then emit the report.
    print('\n Data information is: ')
    df.info()
    print('\n Five point summary of the data is: \n', df.describe().T)
    print('\n Number of NA values in the data: \n', df.isna().sum())
    print('\n Number of Null values in the data: \n', df.isnull().sum())
    print('\n Number of duplicated records are : \n', df.duplicated().sum())

    return df

In [44]:
# Print the summary and rebind df to the returned frame, whose column names
# are normalised (stripped, lower-case, spaces replaced by underscores).
df = data_overview(df)

List of columns in teh given data are: 
 Index(['Temparature', 'Humidity ', 'Moisture', 'Soil Type', 'Crop Type',
       'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name'],
      dtype='object')

 List of columns in teh given data are: 
 Index(['temparature', 'humidity', 'moisture', 'soil_type', 'crop_type',
       'nitrogen', 'potassium', 'phosphorous', 'fertilizer_name'],
      dtype='object')

 Shape of the data is: 
 (99, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   temparature      99 non-null     int64 
 1   humidity         99 non-null     int64 
 2   moisture         99 non-null     int64 
 3   soil_type        99 non-null     object
 4   crop_type        99 non-null     object
 5   nitrogen         99 non-null     int64 
 6   potassium        99 non-null     int64 
 7   phosphorous      99 non-null     int64 
 8   fe

In [45]:
# Custom transformer for handling NaN or Null checking
class NullChecker(BaseEstimator, TransformerMixin):
    """Transformer that notes which columns hold missing values and zero-fills them."""

    def fit(self, X, y=None):
        """Store in ``null_columns_`` the names of columns with at least one null."""
        has_missing = X.isnull().any()
        self.null_columns_ = X.columns[has_missing].tolist()
        return self

    def transform(self, X):
        """Return ``X`` with every missing value replaced by 0."""
        return X.fillna(0)

In [46]:
# Custom transformer for label encoding
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Integer-encode every object-dtype column with its own ``LabelEncoder``.

    The mapping for each column is learned once in ``fit`` and reused in
    ``transform``, so data transformed later gets the same codes as the data
    the transformer was fitted on.

    Attributes set by ``fit``
    -------------------------
    obj_col_list : list of str
        Names of the object-dtype columns that are encoded.
    encoders_ : dict
        Fitted ``LabelEncoder`` per column name; use
        ``encoders_[col].inverse_transform(codes)`` to recover the labels.
    le : LabelEncoder
        Encoder of the last object column (kept for existing callers); an
        unfitted encoder when there is no object column.
    """

    def fit(self, X, y=None):
        """Learn one label-to-integer mapping per object column of ``X``."""
        print("=== Initiating label encoding ===")
        self.le = LabelEncoder()
        self.obj_col_list = X.select_dtypes(include=['object']).columns.tolist()
        # One encoder per column: a single shared encoder would only remember
        # the classes of the column it was fitted on last.
        self.encoders_ = {col: LabelEncoder().fit(X[col]) for col in self.obj_col_list}
        if self.obj_col_list:
            self.le = self.encoders_[self.obj_col_list[-1]]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by integer codes.

        Uses the mappings learned in ``fit``; nothing is re-fitted here.
        """
        X_encoded = X.copy()
        for col in self.obj_col_list:
            X_encoded[col] = self.encoders_[col].transform(X[col])
        print("=== Label encoding is completed ===")
        return X_encoded

In [47]:
# Custom transformer for standard scaling
class StandardScalerTransformer(BaseEstimator, TransformerMixin):
    """Standardise the int64/float64 columns of a DataFrame.

    Parameters
    ----------
    exclude : str or list of str, optional
        Column name(s) that must never be scaled even when their dtype is
        int64/float64 -- for example an integer-encoded target or encoded
        categorical columns. Defaults to ``None`` (scale every int64/float64
        column).

    Attributes set by ``fit``
    -------------------------
    scaler : StandardScaler
        The underlying scaler (left unfitted when no column qualifies).
    num_col_list : list of str
        Names of the columns that are scaled.
    """

    def __init__(self, exclude=None):
        self.exclude = exclude

    def fit(self, X, y=None):
        """Select the columns to scale and fit the scaler on them."""
        print("=== Initiating standard scaling ===")
        self.scaler = StandardScaler()
        if self.exclude is None:
            skipped = set()
        elif isinstance(self.exclude, str):
            # A bare string is one column name, not an iterable of characters
            skipped = {self.exclude}
        else:
            skipped = set(self.exclude)
        numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        self.num_col_list = [col for col in numeric_cols if col not in skipped]
        # Nothing to learn when no column qualifies
        if self.num_col_list:
            self.scaler.fit(X[self.num_col_list])
        print("=== Standard scaling is completed ===")
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns standardised."""
        X_scaled = X.copy()
        if self.num_col_list:
            X_scaled[self.num_col_list] = self.scaler.transform(X[self.num_col_list])
        return X_scaled

In [48]:
# Splitter stage as a transformer
class DataSplitter(BaseEstimator, TransformerMixin):
    """Separate the target column from a DataFrame and split both into train/test parts."""

    def __init__(self, target_feature, test_size=0.25, random_state=1):
        self.target_feature = target_feature
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y=None):
        """No-op: splitting needs no fitted state."""
        return self

    def transform(self, X):
        """Return the result of ``train_test_split`` on (features, target).

        The target is the ``target_feature`` column of ``X``; the features are
        all remaining columns.
        """
        target = X[self.target_feature]
        features = X.drop(columns=[self.target_feature])
        split_options = {
            'test_size': self.test_size,
            'random_state': self.random_state,
        }
        return train_test_split(features, target, **split_options)

In [49]:
# # Model training and prediction as a final stage in the pipeline
# class ModelTrainer(BaseEstimator, TransformerMixin):
#     def __init__(self, model='RandomForest'):
#         self.model = RandomForestClassifier(n_estimators=100)# if model == 'RandomForest' else XGBClassifier()
    
#     def fit(self, X, y):
#         self.model.fit(X, y)
#         return self
    
#     def predict(self, X):
#         return self.model.predict(X)

In [50]:
# from sklearn.ensemble import RandomForestClassifier

class ModelTrainer(BaseEstimator, TransformerMixin):
    """Thin wrapper that builds a classifier and exposes ``fit`` / ``predict``.

    Parameters
    ----------
    n_estimators, max_depth : int or None
        Forwarded to whichever classifier is built.
    min_samples_split, min_samples_leaf, criterion, bootstrap
        Forwarded to ``RandomForestClassifier`` only.
    model : str
        ``'RandomForest'`` builds a ``RandomForestClassifier``; any other value
        builds an ``XGBClassifier``.
    random_state : int or None, optional
        Seed forwarded to the classifier so that training is reproducible.
        Defaults to ``None`` (unseeded).

    Attributes
    ----------
    model : estimator
        The wrapped classifier instance, available right after construction.

    NOTE(review): the constructor arguments are not stored as attributes of the
    same name and ``model`` is rebound to the estimator, so this class probably
    does not support scikit-learn's ``get_params`` / ``clone`` protocol -- confirm
    before using it inside ``GridSearchCV``.
    """

    def __init__(self,
                 n_estimators=100,
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 criterion='gini',
                 bootstrap=True,
                 model='RandomForest',
                 random_state=None):

        if model == 'RandomForest':
            self.model = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                criterion=criterion,
                bootstrap=bootstrap,
                random_state=random_state
            )
        else:
            # Forward the hyperparameters that XGBoost shares with the random
            # forest instead of silently discarding them.
            self.model = XGBClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=random_state
            )

    def fit(self, X, y):
        """Train the wrapped classifier on ``X`` / ``y`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X``."""
        return self.model.predict(X)


In [51]:
# Model saving transformer
class ModelSaver(BaseEstimator, TransformerMixin):
    """Pickles an object to ``filename`` whenever ``transform`` is called."""

    def __init__(self, filename):
        self.filename = filename

    def fit(self, X, y):
        """No-op: nothing is learned."""
        return self

    def transform(self, X, y):
        """Write ``y`` to ``self.filename`` with pickle and return ``X`` unchanged.

        ``y`` is whatever object the caller wants persisted.
        """
        with open(self.filename, "wb") as handle:
            pkl.dump(y, handle)
        return X

In [52]:
# Preparing the pipeline
pipeline = Pipeline(steps=[
    # Missing-value handling
    ('null_checker', NullChecker()),
    # Integer-encode the categorical columns
    ('label_encoder', LabelEncoderTransformer()),
    # Standardise the numeric columns
    ('scaling', StandardScalerTransformer()),
    # Separate the target and create the train/test split
    ('splitter', DataSplitter(target_feature='fertilizer_name')),
    # Random-forest classifier with explicit hyperparameters
    (
        'model_trainer',
        ModelTrainer(
            n_estimators=50,
            max_depth=10,
            min_samples_split=2,
            min_samples_leaf=1,
            criterion='gini',
            bootstrap=True,
        ),
    ),
    # Persist the trained model
    ('model_saver', ModelSaver(filename='final_rf_model.pkl')),
])

In [53]:
# Step 3: Run the preprocessing stages one by one on the entire DataFrame
pipeline.named_steps['null_checker'].fit(df)
df_transformed = pipeline.named_steps['null_checker'].transform(df)

# Encode the categorical columns (this includes the target column)
df_transformed = pipeline.named_steps['label_encoder'].fit_transform(df_transformed)

# Scale the features only. The scaling stage picks up every int64/float64
# column, and the label-encoded target can come out as int64 (platform
# dependent); standardising it would turn the class labels into continuous
# floats. The target is therefore held out and re-attached after scaling.
target_col = pipeline.named_steps['splitter'].target_feature
target_encoded = df_transformed[target_col]
df_transformed = pipeline.named_steps['scaling'].fit_transform(
    df_transformed.drop(columns=[target_col])
)
df_transformed[target_col] = target_encoded

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = pipeline.named_steps['splitter'].fit_transform(df_transformed)

=== Initiating label encoding ===
=== Label encoding is completed ===
=== Initiating standard scaling ===
=== Standard scaling is completed ===


In [54]:
# Handles to the two pipeline stages used in this cell
model_trainer = pipeline.named_steps['model_trainer']
model_saver = pipeline.named_steps['model_saver']

# Step 4: train the classifier on the training split
model_trainer.fit(X_train, y_train)

# Step 5: pickle the fitted classifier (the saver writes its second argument to disk)
model_saver.transform(X_test, model_trainer.model)

# Step 6: predict the held-out split
y_pred = model_trainer.predict(X_test)
print(y_pred)

# Evaluate against the true labels of the held-out split
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
test_report = classification_report(y_test, y_pred)
print("Classification Report:\n", test_report)

[6 3 6 1 0 4 6 6 0 1 1 1 2 3 4 5 2 6 6 5 1 5 5 4 5]
Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.80      1.00      0.89         4
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         6

    accuracy                           0.96        25
   macro avg       0.97      0.95      0.96        25
weighted avg       0.97      0.96      0.96        25

