# 1. DATA PREPROCESSING STEP

In [None]:
#

In [8]:
import os
import shutil

# Uploaded CSV (source) and the filename the later cells read from (destination)
source_file = '/content/Churn_Modelling_CSV.csv'
destination_file = 'Churn_Modelling.csv'

if not os.path.exists(source_file):
    # Nothing to copy from: report it and leave any existing destination untouched
    print(f"Error: Source CSV file '{source_file}' not found. Please ensure the correct CSV file is uploaded.")
else:
    # Drop a stale copy left behind by an earlier run before copying again
    if os.path.exists(destination_file):
        os.remove(destination_file)
        print(f"Removed old '{destination_file}'.")

    # Place the uploaded CSV under the expected filename
    shutil.copyfile(source_file, destination_file)
    print(f"Successfully set up '{destination_file}' from '{source_file}'.")

Removed old 'Churn_Modelling.csv'.
Successfully set up 'Churn_Modelling.csv' from '/content/Churn_Modelling_CSV.csv'.


In [9]:
#STEP 1 INSTALLING LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# Step 2: load the churn dataset from the working directory
data = pd.read_csv('Churn_Modelling.csv')

# Step 3: separate features (X) from the target (y).
# Columns 0-2 (row number, customer id, surname) carry no predictive signal,
# so the features run from column 3 up to, but excluding, the last column.
feature_frame = data.iloc[:, 3:-1]
target_column = data.iloc[:, -1]
X = feature_frame.values
y = target_column.values

# Step 4: encode the categorical columns.
# Gender sits at index 2 of X (credit score = 0, geography = 1, gender = 2);
# label encoding turns Female/Male into 0/1.
le = LabelEncoder()
gender_codes = le.fit_transform(X[:, 2])
X[:, 2] = gender_codes

# Geography (index 1: France, Spain, Germany) is one-hot encoded into
# separate binary columns; all remaining columns pass through untouched.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

# Step 5: hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Step 6: feature scaling. The scaler is fitted on the training split only
# and then reused, unchanged, on the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# 2. ARCHITECTURE OF THE BRAIN


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the ANN architecture
class ChurnPredictor(nn.Module):
  """
  Feed-forward binary classifier: two ReLU hidden layers and a sigmoid output.

  Args:
    in_features: number of input features per sample. Defaults to 12, the
      width this notebook's original architecture was written for.
    hidden_size: number of neurons in each hidden layer (default 8).
  """
  def __init__(self, in_features=12, hidden_size=8):
    super(ChurnPredictor, self).__init__()

    # Hidden layer 1: in_features -> hidden_size
    self.layer1 = nn.Linear(in_features=in_features, out_features=hidden_size)
    # Hidden layer 2: hidden_size -> hidden_size
    self.layer2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    # Output layer: hidden_size -> 1, since only a single probability is needed
    self.output_layer = nn.Linear(in_features=hidden_size, out_features=1)

  def forward(self, x):
    """
    Map a float tensor whose last dimension is `in_features` to
    probabilities in [0, 1] with a last dimension of size 1.
    """
    # 1. First hidden layer followed by ReLU
    x = F.relu(self.layer1(x))

    # 2. Second hidden layer followed by ReLU
    x = F.relu(self.layer2(x))

    # 3. Output layer; sigmoid squashes the result between 0 and 1 (probability)
    x = torch.sigmoid(self.output_layer(x))

    return x

# Instantiate the model with the default sizes (12 inputs, 8 hidden units)
model = ChurnPredictor()
print(model)


ChurnPredictor(
  (layer1): Linear(in_features=12, out_features=8, bias=True)
  (layer2): Linear(in_features=8, out_features=8, bias=True)
  (output_layer): Linear(in_features=8, out_features=1, bias=True)
)


# 3. TRAINING THE BRAIN

In [11]:
# 1. SET UP THE DATA FOR PYTORCH
# Convert the numpy arrays into PyTorch tensors.
# .unsqueeze(1) changes y from [0,1,0] to [[0],[1],[0]] so it matches the
# (N, 1) shape of the model output.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

# 2. DEFINE THE TEACHER AND THE CORRECTOR
criterion = nn.BCELoss()  # binary cross-entropy loss (the teacher)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Adam optimiser (the corrector)

# 3. THE TRAINING LOOP
epochs = 100  # how many times we go through the whole training set
model.train()  # make sure the model is in training mode, even if this cell is re-run after evaluation
# The loop variable is `epoch`, not `epochs`, so the total is not overwritten
# and the progress line reports "10/100" rather than "10/9".
for epoch in range(epochs):
  # A. Forward pass (the guess)
  y_pred = model(X_train_tensor)

  # B. Calculate the loss (the grade)
  loss = criterion(y_pred, y_train_tensor)

  # C. Backward pass (the learning)
  optimizer.zero_grad()  # clear gradients left over from the previous step
  loss.backward()        # compute gradients (how much to adjust each weight)
  optimizer.step()       # update the weights

  # Monitoring: report the loss every 10 epochs
  if (epoch + 1) % 10 == 0:
    print(f'Epoch [{epoch+1}/{epochs}] ,Loss:{loss.item():.4f}')











Epoch [10/9] ,Loss:0.6494
Epoch [20/19] ,Loss:0.5494
Epoch [30/29] ,Loss:0.4756
Epoch [40/39] ,Loss:0.4392
Epoch [50/49] ,Loss:0.4220
Epoch [60/59] ,Loss:0.4025
Epoch [70/69] ,Loss:0.3838
Epoch [80/79] ,Loss:0.3691
Epoch [90/89] ,Loss:0.3553
Epoch [100/99] ,Loss:0.3482


# 4. THE EVALUATION PHASE

In [12]:
# 1. Prepare the test data as float tensors; y gets a trailing dimension
#    so it lines up with the model output.
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# 2. Switch the model to evaluation mode
model.eval()

# 3. The exam: run inference without tracking gradients
with torch.no_grad():
    y_pred_prob = model(X_test_tensor)

# Turn probabilities into hard 0/1 decisions by rounding
# (above 0.5 becomes 1, below 0.5 becomes 0).
y_pred_cls = y_pred_prob.round()

# Grade the exam: .eq() marks matching predictions, .sum() counts them
matches = y_pred_cls.eq(y_test_tensor)
correct_count = matches.sum().item()
accuracy = correct_count / y_test.shape[0]
print(f"test accuracy :{accuracy:.4f}")


test accuracy :0.8545


### WHOLE THEME OR SUMMARY OF WHAT I HAVE DONE HERE IN CODE
1] WE HAVE A PROBLEM IN A BANK: MANY CUSTOMERS WITH DIFFERENT FEATURES AND BACKGROUNDS ARE LEAVING IT, AS RECORDED IN THE CHURN CSV FILE. WE HAVE TO ANALYSE THAT DATA WITH OUR ML MODEL. THE MODEL DRAWS INFERENCES, OR CONNECTIONS, BETWEEN PEOPLE'S DIFFERENT FEATURES (COUNTRY, INCOME AND THE OTHER COLUMNS IN THE CSV FILE) AND THEIR PROBABILITY OF LEAVING THE BANK. IT THEN ADJUSTS ITS WEIGHTS TO GIVE US ACCURATE PROBABILITIES OF WHETHER A PERSON WILL LEAVE OR NOT, BASED ON WHAT IT HAS LEARNED FROM THE PREVIOUS DATA. NOW, IF WE GIVE THE MODEL A NEW PERSON WITH CERTAIN FEATURES, IT WILL PREDICT THE PROBABILITY OF THAT PERSON LEAVING.

LSTM PROJECT

In [13]:
#WIKI TEXT DATA LOADER
import os
import torch
import zipfile
import urllib.request
from io import open

class Dictionary(object):
    """
    Two-way vocabulary: `word2idx` maps a word to its integer id and
    `idx2word` maps an id back to the word. Ids are assigned in
    first-seen order, starting at 0.
    """
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register `word` if it is new and return its integer id."""
        index = self.word2idx.get(word)
        if index is None:
            # The next free id equals the current vocabulary size
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        return index

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """
    Handles file loading, tokenization, and tensorization of the text.

    On construction the three WikiText-2 splits are located under `path`
    (any missing split is downloaded) and tokenized into 1-D int64 tensors
    stored in `self.train`, `self.valid` and `self.test`. All splits share
    one vocabulary, `self.dictionary`.
    """
    def __init__(self, path='./data'):
        self.dictionary = Dictionary()

        # Auto-download logic to make this script self-contained
        os.makedirs(path, exist_ok=True)

        base_url = "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/"
        split_paths = {
            split: os.path.join(path, f'wikitext-2.{split}.txt')
            for split in ('train', 'valid', 'test')
        }

        # Check every split, not only the train file: if an earlier download
        # stopped part-way, the missing valid/test files are fetched here
        # instead of tripping the existence check in tokenize().
        missing = [split for split, file_path in split_paths.items()
                   if not os.path.exists(file_path)]
        if missing:
            print("Downloading WikiText-2 dataset...")
            for split in missing:
                urllib.request.urlretrieve(base_url + split + '.txt', split_paths[split])
            print("Download complete.")

        self.train = self.tokenize(split_paths['train'])
        self.valid = self.tokenize(split_paths['valid'])
        self.test = self.tokenize(split_paths['test'])

    def tokenize(self, path):
        """
        Tokenizes a text file.

        Each line is split on whitespace and terminated with an '<eos>'
        token. New words are added to `self.dictionary` as they are met.

        Returns:
            A 1-D int64 tensor with the word ids of the whole file, in
            order; an empty int64 tensor when the file has no lines.
        """
        assert os.path.exists(path), f"Corpus file not found: {path}"

        # Single pass: add_word() returns the id of both new and known words,
        # so the file does not need to be read a second time.
        idss = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']  # Add End of Sentence token
                ids = [self.dictionary.add_word(word) for word in words]
                idss.append(torch.tensor(ids, dtype=torch.int64))

        # torch.cat raises on an empty list, so handle an empty file explicitly
        if not idss:
            return torch.empty(0, dtype=torch.int64)

        return torch.cat(idss)

In [14]:
#LSTM MODEL ARCHITECTURE
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    """
    LSTM language model.

    Pipeline:
    word ids -> Embedding -> Dropout -> LSTM -> Dropout -> Linear -> logits
    """
    def __init__(self, vocab_size, embed_size, hidden_size, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()

        # Shared dropout module, applied to the embeddings and to the LSTM output
        self.drop = nn.Dropout(dropout)

        # Encoder: looks up a dense vector for every word id
        self.encoder = nn.Embedding(vocab_size, embed_size)

        # Recurrent core. batch_first is left at its default (False), so
        # tensors are laid out as (sequence length, batch, features).
        self.rnn = nn.LSTM(embed_size, hidden_size, nlayers, dropout=dropout)

        # Decoder: projects hidden states onto the vocabulary
        self.decoder = nn.Linear(hidden_size, vocab_size)

        # Weight tying: the decoder reuses the embedding matrix, which is
        # only possible when both have the same width.
        if tie_weights and hidden_size != embed_size:
            raise ValueError('When using the tied flag, nhid must be equal to emsize')
        if tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()
        self.hidden_size = hidden_size
        self.nlayers = nlayers

    def init_weights(self):
        """Fill encoder/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
        bound = 0.1
        with torch.no_grad():
            self.encoder.weight.uniform_(-bound, bound)
            self.decoder.bias.zero_()
            self.decoder.weight.uniform_(-bound, bound)

    def forward(self, input_seq, hidden_state):
        """
        input_seq: word ids, shape [seq_len, batch_size]
        hidden_state: tuple (h_0, c_0) carried over from the previous batch

        Returns logits of shape [seq_len, batch_size, vocab_size] and the
        updated hidden state.
        """
        embedded = self.drop(self.encoder(input_seq))

        # lstm_out: [seq_len, batch_size, hidden_size]
        lstm_out, hidden_state = self.rnn(embedded, hidden_state)
        lstm_out = self.drop(lstm_out)

        # The decoder is applied to a flattened [seq_len * batch_size, hidden_size]
        # view, and the result is folded back into three dimensions.
        seq_len, batch_size, width = lstm_out.size()
        logits = self.decoder(lstm_out.view(seq_len * batch_size, width))
        return logits.view(seq_len, batch_size, logits.size(1)), hidden_state

    def init_hidden(self, batch_size):
        """Return a zero-filled (h_0, c_0) pair on the same device/dtype as the model."""
        reference = next(self.parameters())
        state_shape = (self.nlayers, batch_size, self.hidden_size)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

In [15]:
#TRAINING ENGINE
import time
import math
import torch
import torch.nn as nn

# --- Configuration ---
# Batching
BATCH_SIZE = 20        # parallel text streams per training batch
EVAL_BATCH_SIZE = 10   # parallel text streams during validation/test
SEQ_LEN = 35           # BPTT (backprop through time) window, in tokens

# Model
EMBED_SIZE = 200       # width of the word vectors
HIDDEN_SIZE = 200      # width of the LSTM state
LAYERS = 2             # stacked LSTM layers
DROPOUT = 0.2          # dropout probability (regularization)

# Optimization
EPOCHS = 5             # kept low for the demo
LR = 20.0              # initial learning rate
CLIP = 0.25            # gradient-norm clipping threshold

# Run on the GPU when one is available, otherwise on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# --- Data Preparation ---
print("Loading data...")
corpus = Corpus('./data')  # uses the Corpus class defined in this notebook

def batchify(data, bsz):
    """
    Arrange a 1-D token tensor into `bsz` parallel columns.

    Trailing tokens that do not fill a complete row are dropped. The result
    has shape [N // bsz, bsz]; each column is one contiguous stream of the
    original text. The tensor is moved to the module-level `device`.
    """
    usable_len = (data.size(0) // bsz) * bsz
    trimmed = data[:usable_len]
    # Lay the text out as bsz rows, then transpose so time runs down the rows
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

# Shape each split into parallel streams; evaluation uses its own batch size
train_data = batchify(corpus.train, BATCH_SIZE)
val_data = batchify(corpus.valid, EVAL_BATCH_SIZE)
test_data = batchify(corpus.test, EVAL_BATCH_SIZE)

vocab_size = len(corpus.dictionary)
print(f"Vocabulary Size: {vocab_size}")

# --- Model Setup ---
# Weight tying is enabled, which requires EMBED_SIZE == HIDDEN_SIZE.
# Note: this rebinds the name `model`, which held the churn network earlier
# in the notebook.
model = RNNModel(
    vocab_size,
    EMBED_SIZE,
    HIDDEN_SIZE,
    LAYERS,
    DROPOUT,
    tie_weights=True,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

# --- Helper Functions ---

def repackage_hidden(h):
    """
    Detach hidden states from the graph that produced them.

    Accepts a tensor or an arbitrarily nested iterable of tensors and returns
    the same structure (iterables become tuples) with every tensor detached,
    so backpropagation stops at the current batch boundary.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

def get_batch(source, i):
    """
    Cut one training window out of `source`, starting at row `i`.

    The window is SEQ_LEN rows long, or shorter when fewer rows remain.
    The targets are the same rows shifted forward by one position and
    flattened to 1-D.
    """
    rows_left = len(source) - 1 - i
    seq_len = SEQ_LEN if SEQ_LEN <= rows_left else rows_left
    stop = i + seq_len
    data = source[i:stop]
    target = source[i + 1:stop + 1].view(-1)
    return data, target

def evaluate(data_source):
    """
    Return the average per-position loss of the global `model` on `data_source`.

    `data_source` is expected to be batchified with EVAL_BATCH_SIZE, since
    the initial hidden state is created for that batch size.
    """
    model.eval()
    hidden = model.init_hidden(EVAL_BATCH_SIZE)
    loss_sum = 0.

    with torch.no_grad():
        for start in range(0, data_source.size(0) - 1, SEQ_LEN):
            data, targets = get_batch(data_source, start)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)

            # Flatten to [seq_len * batch_size, vocab_size] for the loss, then
            # weight the batch mean by the window length so that a shorter
            # final window does not count as much as a full one.
            batch_loss = criterion(output.view(-1, vocab_size), targets).item()
            loss_sum += len(data) * batch_loss

    return loss_sum / (len(data_source) - 1)

def train():
    """
    Run one epoch of training over `train_data`.

    Reads the module-level `model`, `criterion`, `train_data`, `vocab_size`,
    `lr` (current learning rate) and `epoch` (used only in the progress
    line), so those must be defined before this is called. Prints the mean
    loss and perplexity every 200 batches. Returns nothing.
    """
    model.train()
    total_loss = 0.
    hidden = model.init_hidden(BATCH_SIZE)

    for batch, i in enumerate(range(0, train_data.size(0) - 1, SEQ_LEN)):
        data, targets = get_batch(train_data, i)

        # Detach hidden state from previous batch so backprop stops here
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()

        # Clip gradients to prevent the "exploding gradient" problem common in LSTMs
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)

        # Manual SGD update. Done under no_grad() rather than through `.data`,
        # and parameters that received no gradient are skipped.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % 200 == 0 and batch > 0:
            cur_loss = total_loss / 200
            print(f'| epoch {epoch:3d} | {batch:5d}/{len(train_data)//SEQ_LEN:5d} batches | '
                  f'loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}')
            total_loss = 0

# --- Execution ---
best_val_loss = None  # lowest validation loss seen so far (None until the first epoch ends)
lr = LR               # current learning rate; read by train() and annealed below

try:
    print("-" * 89)
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print("-" * 89)
        print(f'| end of epoch {epoch:3d} | time: {time.time() - epoch_start_time:5.2f}s | '
              f'valid loss {val_loss:5.2f} | valid ppl {math.exp(val_loss):8.2f}')
        print("-" * 89)

        # Learning Rate Annealing: If validation loss doesn't improve, lower the learning rate.
        # Compare against None explicitly so that a stored loss of 0.0 is not
        # mistaken for "no best loss yet".
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
        else:
            lr /= 4.0

except KeyboardInterrupt:
    # Allow training to be stopped by hand; the final test below still runs
    print('-' * 89)
    print('Exiting from training early')

# Final Test
test_loss = evaluate(test_data)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | test ppl {math.exp(test_loss):8.2f}')
print('=' * 89)

Using device: cpu
Loading data...
Downloading WikiText-2 dataset...
Download complete.
Vocabulary Size: 33278
-----------------------------------------------------------------------------------------
| epoch   1 |   200/ 2983 batches | loss  7.61 | ppl  2010.44
-----------------------------------------------------------------------------------------
Exiting from training early


KeyboardInterrupt: 

THIS LSTM MODEL PREDICTS THE NEXT OUTPUT FROM THE PREVIOUS INPUTS USING THE LSTM ARCHITECTURE. THE ADVANTAGE OF USING AN LSTM IS THAT IT CAN GENERATE TEXT WHILE EASILY MAINTAINING CONTEXT, THANKS TO ITS CELL STATE AND FORGET GATE. Language modelling with an LSTM refers to the specific technical process of training an LSTM network to learn the probability distribution of the next word (or character) in a sequence, given the previous words. In simple terms, it is teaching the LSTM to answer the question: "Given the history I have seen so far, what is the most likely word to come next?"