In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rammp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rammp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rammp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
## import sklearn
## print(sklearn.__version__)


In [3]:
## import pandas
## print(pandas.__version__)

pip install pandas==1.4.2

SyntaxError: invalid syntax (2817781186.py, line 4)

In [4]:
## import flask
## print(flask.__version__)

pip install flask==1.1.2

SyntaxError: invalid syntax (3246517286.py, line 4)

In [5]:
## import joblib
## print(joblib.__version__)

pip install joblib==1.1.0

SyntaxError: invalid syntax (1231657891.py, line 4)

In [6]:
# Load the dataset
df = pd.read_csv('your_dataset.csv')  # Replace with your dataset path

# Display first few rows to verify data
print("Dataset Head:")
print(df.head())
print("\nDataset Info:")
print(df.info())

Dataset Head:
         Al      Cu      Mg        Si      Zn        Zr        Mn        Ag  \
0  0.872024  0.0246  0.0240  0.002498  0.0690  0.001707  0.000778  0.000151   
1  0.889151  0.0186  0.0194  0.004803  0.0605  0.001600  0.001313  0.000170   
2  0.863341  0.0293  0.0271  0.003928  0.0690  0.001275  0.001809  0.000088   
3  0.877796  0.0294  0.0266  0.003395  0.0537  0.001925  0.001598  0.000144   
4  0.890138  0.0128  0.0270  0.001624  0.0609  0.000641  0.001710  0.000136   

         Li        Ca        Fe        Ti        Sn      Cr        Ge  \
0  0.000093  0.000136  0.003793  0.000567  0.000205  0.0002  0.000140   
1  0.000170  0.000171  0.003144  0.000531  0.000199  0.0000  0.000127   
2  0.000200  0.000164  0.002238  0.000123  0.000463  0.0007  0.000093   
3  0.000055  0.000073  0.004255  0.000407  0.000200  0.0003  0.000051   
4  0.000185  0.000072  0.003739  0.000442  0.000209  0.0001  0.000124   

         Sc Crack Susceptibility Level  \
0  0.000109                   

In [7]:
# Separate features, target, and observations
X = df[['Al', 'Zn', 'Cu', 'Mg', 'Si', 'Zr', 'Mn', 'Ag', 'Li', 'Ca', 
        'Fe', 'Ti', 'Sn', 'Cr', 'Ge', 'Sc']]
y = df['Crack Susceptibility Level']
observations = df['Observations']

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("\nUnique susceptibility levels:", y.unique())

Features shape: (500, 16)
Target shape: (500,)

Unique susceptibility levels: ['Medium' 'Low' 'Very Low' 'Very High' 'High']


In [8]:
# Initialize preprocessors
scaler = StandardScaler()
label_encoder = LabelEncoder()

# Scale features and encode target
X_scaled = scaler.fit_transform(X)
y_encoded = label_encoder.fit_transform(y)

print("Encoded susceptibility levels:", label_encoder.classes_)

Encoded susceptibility levels: ['High' 'Low' 'Medium' 'Very High' 'Very Low']


In [9]:
# Initialize text preprocessing tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens 
             if token not in stop_words and token.isalnum()]
    return ' '.join(tokens)

# Process observations
processed_observations = observations.apply(preprocess_text)

# Create TF-IDF matrix
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(processed_observations)

print("Processed first observation:")
print("Original:", observations.iloc[0])
print("Processed:", processed_observations.iloc[0])

Processed first observation:
Original: High Zn can contribute to segregation but strengthens alloy.
Processed: high zn contribute segregation strengthens alloy


In [10]:
# Train nearest neighbors model
nn_model = NearestNeighbors(n_neighbors=3, metric='cosine')
nn_model.fit(tfidf_matrix)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
rf_model.fit(X_scaled, y_encoded)

# Calculate feature importance
feature_importance = pd.DataFrame({
    'Element': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Top 5 most important elements:")
print(feature_importance.head())

Top 5 most important elements:
   Element  Importance
3       Mg    0.212258
2       Cu    0.194502
1       Zn    0.191161
13      Cr    0.179842
0       Al    0.139008


In [11]:
def predict_with_explanation(new_composition):
    """
    Predict crack susceptibility and provide explanation for new composition
    """
    # Scale the input composition
    scaled_comp = scaler.transform(new_composition)
    
    # Make prediction
    pred_encoded = rf_model.predict(scaled_comp)
    pred_label = label_encoder.inverse_transform(pred_encoded)
    prob_scores = rf_model.predict_proba(scaled_comp)
    
    # Find similar cases
    query_vec = vectorizer.transform(processed_observations)
    distances, indices = nn_model.kneighbors(query_vec)
    similar_cases_idx = indices[0]
    
    # Generate explanation
    explanation = "Prediction Analysis:\n"
    explanation += f"1. Predicted Crack Susceptibility: {pred_label[0]}\n\n"
    
    # Add probability scores
    explanation += "2. Confidence Levels:\n"
    for i, label in enumerate(label_encoder.classes_):
        explanation += f"   {label}: {prob_scores[0][i]:.2f}\n"
    
    # Add composition-based reasoning
    explanation += "\n3. Key Influencing Elements:\n"
    top_elements = feature_importance.head(3)
    for _, row in top_elements.iterrows():
        element = row['Element']
        importance = row['Importance']
        value = new_composition[element].values[0]
        explanation += f"   - {element}: {value:.2f}% (Importance: {importance:.3f})\n"
    
    # Add similar cases
    explanation += "\n4. Similar Historical Cases:\n"
    for i, idx in enumerate(similar_cases_idx[:3], 1):
        explanation += f"Case {i}:\n"
        explanation += f"   Susceptibility: {y.iloc[idx]}\n"
        explanation += f"   Observation: {observations.iloc[idx]}\n\n"
    
    return pred_label[0], explanation

In [12]:
# Create a test composition
new_composition = pd.DataFrame({
    'Al': [92.5], 'Zn': [2.5], 'Cu': [1.5], 'Mg': [2.0],
    'Si': [0.2], 'Zr': [0.1], 'Mn': [0.3], 'Ag': [0.1],
    'Li': [0.1], 'Ca': [0.1], 'Fe': [0.2], 'Ti': [0.1],
    'Sn': [0.1], 'Cr': [0.1], 'Ge': [0.1], 'Sc': [0.1]
})

# Get prediction and explanation
prediction, explanation = predict_with_explanation(new_composition)

print("=== Crack Susceptibility Analysis ===")
print(explanation)

=== Crack Susceptibility Analysis ===
Prediction Analysis:
1. Predicted Crack Susceptibility: Very Low

2. Confidence Levels:
   High: 0.12
   Low: 0.08
   Medium: 0.03
   Very High: 0.24
   Very Low: 0.53

3. Key Influencing Elements:
   - Mg: 2.00% (Importance: 0.212)
   - Cu: 1.50% (Importance: 0.195)
   - Zn: 2.50% (Importance: 0.191)

4. Similar Historical Cases:
Case 1:
   Susceptibility: Medium
   Observation: High Zn can contribute to segregation but strengthens alloy.

Case 2:
   Susceptibility: Medium
   Observation: High Zn can contribute to segregation but strengthens alloy.

Case 3:
   Susceptibility: Medium
   Observation: High Zn can contribute to segregation but strengthens alloy.




In [13]:
import joblib

# Save the trained models
joblib.dump(rf_model, 'rf_model.pkl')  # RandomForest model
joblib.dump(scaler, 'scaler.pkl')  # Standard Scaler
joblib.dump(label_encoder, 'label_encoder.pkl')  # Label Encoder
joblib.dump(vectorizer, 'vectorizer.pkl')  # TF-IDF Vectorizer
joblib.dump(nn_model, 'nn_model.pkl')  # Nearest Neighbors Model

print("Models and preprocessors saved successfully!")


Models and preprocessors saved successfully!


In [14]:
import os
print(os.getcwd())

C:\Users\rammp\Documents
