In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# Make charts look nice: apply the 'seaborn-v0_8' matplotlib style sheet to
# every figure created later in the notebook.
plt.style.use('seaborn-v0_8')
# IPython magic: render matplotlib figures inline in the cell output.
# Kept after style.use on purpose; do not reorder without re-checking the plots.
%matplotlib inline

# Confirmation that the import/setup cell ran to completion.
print("All tools loaded successfully!")

In [None]:

# Step 1: Load our stock data
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where this exact file exists.
file_path = r"D:\TCS project\TCS_stock_history - Copy.csv"
stock_data = pd.read_csv(file_path)

# Preview the first rows, then report the table dimensions.
row_count, column_count = stock_data.shape
print("First few rows of our data:", stock_data.head(), sep="\n")
print(f"\nOur data has {row_count} rows and {column_count} columns")


In [None]:
# Print column names, dtypes and non-null counts to see what needs cleaning.
stock_data.info()

In [None]:
# Step 2: Clean up the data
#
# Real-world data can be messy. This cell:
# - parses the Date column so it is handled as real dates,
# - orders the rows chronologically,
# - reports and fills missing values.
# NOTE(review): numeric columns are not coerced here; this relies on
# pd.read_csv having inferred numeric dtypes for the price/volume columns.

# Fix the date column
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# Sort by date (oldest to newest); later rolling/shift features and the
# chronological train/test split depend on this ordering.
stock_data = stock_data.sort_values('Date')

# Check for missing values
print("Missing values in each column:")
print(stock_data.isnull().sum())

# Fill any missing values with the previous day's data.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
stock_data = stock_data.ffill()

# Remove any remaining empty rows (gaps at the very start have no earlier
# value to carry forward, so ffill leaves them as NaN).
stock_data = stock_data.dropna()

print(f"Clean data shape: {stock_data.shape}")

In [None]:
# Step 3: Explore our data - basic statistics
#
# Summary statistics (count, mean, spread, min/max, quartiles) for the price
# and volume columns help spot odd patterns or data errors early.
price_volume_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
summary_stats = stock_data[price_volume_columns].describe()

print("Basic statistics of our stock data:")
print(summary_stats)

# Re-check the head of the table now that it has been cleaned and sorted.
cleaned_preview = stock_data.head()
print("\nFirst 5 rows after cleaning:")
print(cleaned_preview)

In [None]:
# Moving averages of the closing price (smoother lines that show trends).
# Nothing is drawn here: the columns '30_day_avg' and '200_day_avg' are only
# computed, and are plotted by a later cell.
# The first (window - 1) rows of each column are NaN because the rolling
# window is not yet full.
for window in (30, 200):
    stock_data[f'{window}_day_avg'] = stock_data['Close'].rolling(window=window).mean()

# Column-wise means of the table, shown as this cell's output.
stock_data.mean()

In [None]:
# Chart 1: Stock price over time
# One line of daily closing prices against the trading date.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(stock_data['Date'], stock_data['Close'], color='blue', linewidth=1)
ax.set_title('TCS Stock Price Over Time')
ax.set_xlabel('Date')
# \u20b9 is the Indian rupee sign. The label previously held the mis-decoded
# byte sequence of that character; the escape keeps it encoding-safe.
ax.set_ylabel('Price (\u20b9)')
ax.grid(True)
plt.show()

In [None]:
# Chart 2: Trading volume
# Daily traded volume against the trading date, drawn slightly transparent.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(stock_data['Date'], stock_data['Volume'], color='green', alpha=0.7)
ax.set_title('Trading Volume Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Volume')
ax.grid(True)
plt.show()

In [None]:
# Chart 3: Closing price with its 30-day and 200-day moving averages.
# Uses the '30_day_avg' / '200_day_avg' columns computed in an earlier cell.
fig, ax = plt.subplots(figsize=(12, 6))
# The raw daily series is faded so the two smoothed trend lines stand out.
ax.plot(stock_data['Date'], stock_data['Close'], label='Daily Price', alpha=0.5)
ax.plot(stock_data['Date'], stock_data['30_day_avg'], label='30-day Average', linewidth=2)
ax.plot(stock_data['Date'], stock_data['200_day_avg'], label='200-day Average', linewidth=2)
ax.set_title('Stock Price with Moving Averages')
ax.set_xlabel('Date')
# \u20b9 is the Indian rupee sign. The label previously held the mis-decoded
# byte sequence of that character; the escape keeps it encoding-safe.
ax.set_ylabel('Price (\u20b9)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Correlation heatmap of the price and volume columns.

# 1. Keep only the columns we want to compare
numeric_data = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']]

# 2. Pairwise correlations between those columns
correlation_matrix = numeric_data.corr()

# 3. Draw the annotated heatmap; center=0 puts the colormap midpoint at
#    zero correlation
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)

# 4. Title the figure and display it
ax.set_title('How Stock Factors Relate to Each Other')
plt.show()

print("Values close to 1 or -1 mean strong relationship. Values near 0 mean weak relationship.")

In [None]:
# Step 6: Create features for our prediction model
#
# Derive extra indicator columns from the existing data:
# - calendar parts of the date (year, month, day of week)
# - previous day's closing price and the daily percentage change
# - exponential moving averages and their difference (MACD)
close_prices = stock_data['Close']
date_parts = stock_data['Date'].dt

# Time-based features
stock_data['year'] = date_parts.year
stock_data['month'] = date_parts.month
stock_data['day_of_week'] = date_parts.dayofweek  # Monday=0, Sunday=6

# Lagged close and day-over-day relative change
stock_data['previous_close'] = close_prices.shift(1)
stock_data['daily_change'] = close_prices.pct_change()

# Momentum indicator (MACD): 12-span EMA minus 26-span EMA of the close
fast_ema = close_prices.ewm(span=12).mean()
slow_ema = close_prices.ewm(span=26).mean()
stock_data['12_day_ema'] = fast_ema
stock_data['26_day_ema'] = slow_ema
stock_data['macd'] = fast_ema - slow_ema

# Drop every row that has a NaN in any column. This includes the warm-up rows
# of the rolling averages and the first row of the shift/pct_change columns.
stock_data = stock_data.dropna()

print("New features added successfully!")
print(f"Data shape after adding features: {stock_data.shape}")

In [None]:
# Step 7: Prepare data for machine learning
#
# We're setting up our prediction problem:
# - Features (X): what is known at the end of a trading day
# - Target (y): what we're trying to predict (tomorrow's closing price)

# Choose which features to use for prediction
feature_columns = ['Open', 'High', 'Low', 'Volume', 'previous_close',
                   '30_day_avg', 'macd', 'day_of_week', 'month']

# Target = the NEXT row's close. Using the same-day 'Close' would leak the
# answer: '30_day_avg' and 'macd' are computed from that very close, and the
# same-day High/Low bracket it. Rows must be in chronological order for
# shift(-1) to mean "tomorrow" (the data is sorted by Date during cleaning).
next_day_close = stock_data['Close'].shift(-1)

# The final row has no following day, so it has no target and is excluded.
has_target = next_day_close.notna()

X = stock_data.loc[has_target, feature_columns]  # Our input features
y = next_day_close[has_target]                   # What we want to predict

# Split data into training and testing sets
# We use older data to train, newer data to test (shuffle=False keeps the
# chronological order, so the test set is the most recent 20%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f"Training set: {X_train.shape[0]} days")
print(f"Testing set: {X_test.shape[0]} days")

In [None]:
# Step 8: Scale our data
#
# Standardize the features so that no single feature dominates just because
# it is measured in bigger numbers.

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so new data can be scaled the same way later
joblib.dump(scaler, "price_scaler.joblib")

print("Data scaled successfully!")

In [None]:
# Step 9: Build a simple linear regression model
#
# This is our baseline model - the simplest way to predict prices.
# It assumes a straight-line relationship between features and price.

# Create and train the model on the scaled training features
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

# Make predictions for the held-out test period
linear_predictions = linear_model.predict(X_test_scaled)

# Evaluate how good our predictions are:
# MAE is the mean absolute error; RMSE is the square root of the mean squared
# error, so both are in the same units as the price.
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_rmse = np.sqrt(mean_squared_error(y_test, linear_predictions))

# \u20b9 is the Indian rupee sign. The messages previously held the mis-decoded
# byte sequence of that character; the escape keeps it encoding-safe.
print("Linear Regression Results:")
print(f"Average prediction error: \u20b9{linear_mae:.2f}")
print(f"Root mean squared error: \u20b9{linear_rmse:.2f}")

In [None]:
# Step 10: Build a random forest model
#
# Random Forest is more sophisticated - it can capture complex patterns
# by combining many decision trees.

# Create and train the model
forest_model = RandomForestRegressor(
    n_estimators=100,  # Number of trees
    max_depth=15,      # How deep each tree can grow
    random_state=42    # For reproducible results
)
forest_model.fit(X_train_scaled, y_train)

# Make predictions for the held-out test period
forest_predictions = forest_model.predict(X_test_scaled)

# Evaluate predictions with the same metrics as the linear baseline
forest_mae = mean_absolute_error(y_test, forest_predictions)
forest_rmse = np.sqrt(mean_squared_error(y_test, forest_predictions))

# \u20b9 is the Indian rupee sign. The messages previously held the mis-decoded
# byte sequence of that character; the escape keeps it encoding-safe.
print("Random Forest Results:")
print(f"Average prediction error: \u20b9{forest_mae:.2f}")
print(f"Root mean squared error: \u20b9{forest_rmse:.2f}")

# Save this model. NOTE(review): it is saved unconditionally, without checking
# that it actually beat the linear baseline.
joblib.dump(forest_model, "tcs_forest_model.joblib")

In [None]:
# Step 11: Visualize our predictions vs actual prices
# (matplotlib is already imported as plt in the first cell, so it is not
# re-imported here.)

# Positional (0..n-1) copies of the test target and the predictions so the
# two series line up index-for-index.
y_test_reset = y_test.reset_index(drop=True)
forest_predictions_series = pd.Series(forest_predictions, index=y_test_reset.index)

# Dates for the test period: the split was chronological, so the test set is
# the tail of the data and the last len(y_test) dates are used.
test_dates = stock_data['Date'].iloc[-len(y_test):].reset_index(drop=True)

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(test_dates, y_test_reset.values, label='Actual Prices', color='blue', linewidth=2)
ax.plot(test_dates, forest_predictions_series.values, label='Predicted Prices', color='red', linestyle='--')
ax.set_title('TCS Stock: Actual vs Predicted Prices (Random Forest)')
ax.set_xlabel('Date')
# \u20b9 is the Indian rupee sign. The strings below previously held the
# mis-decoded byte sequence of that character.
ax.set_ylabel('Price (\u20b9)')
ax.legend()
ax.grid(True)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# Show which model performed better
print("\nModel Comparison:")
print(f"Linear Regression MAE: \u20b9{linear_mae:.2f}")
print(f"Random Forest MAE:     \u20b9{forest_mae:.2f}")

# Relative change in MAE versus the linear baseline. Report "worse" when the
# forest has the larger error instead of printing a negative "% better", and
# avoid dividing by zero when the baseline error is exactly 0.
if linear_mae > 0:
    mae_change_pct = (linear_mae - forest_mae) / linear_mae * 100
    direction = "better" if mae_change_pct >= 0 else "worse"
    print(f"Improvement: {abs(mae_change_pct):.1f}% {direction}")
else:
    print("Improvement: n/a (Linear Regression MAE is zero)")

In [None]:
# Step 11 (alternative view): predictions vs actual prices by sample index
# (matplotlib is already imported as plt in the first cell, so it is not
# re-imported here.)

# Simple approach - plot against the position within the test set (0..n-1)
# instead of calendar dates. Note this rebinds `test_dates` to a range.
test_dates = range(len(y_test))

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(test_dates, y_test.values, label='Actual Prices', color='blue', linewidth=2)
ax.plot(test_dates, forest_predictions, label='Predicted Prices', color='red', linestyle='--')
ax.set_title('TCS Stock: Actual vs Predicted Prices (Random Forest)')
ax.set_xlabel('Test Sample Index')
# \u20b9 is the Indian rupee sign. The strings below previously held the
# mis-decoded byte sequence of that character.
ax.set_ylabel('Price (\u20b9)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Show which model performed better
print("\nModel Comparison:")
print(f"Linear Regression MAE: \u20b9{linear_mae:.2f}")
print(f"Random Forest MAE:     \u20b9{forest_mae:.2f}")

# Relative change in MAE versus the linear baseline. Report "worse" when the
# forest has the larger error instead of printing a negative "% better", and
# avoid dividing by zero when the baseline error is exactly 0.
if linear_mae > 0:
    mae_change_pct = (linear_mae - forest_mae) / linear_mae * 100
    direction = "better" if mae_change_pct >= 0 else "worse"
    print(f"Improvement: {abs(mae_change_pct):.1f}% {direction}")
else:
    print("Improvement: n/a (Linear Regression MAE is zero)")

In [None]:
# Step 12: Save our work and summarize
#
# Persist the remaining model and print a recap of what the notebook did.

# Save the linear model too (the forest model and the scaler were saved in
# earlier cells)
joblib.dump(linear_model, "tcs_linear_model.joblib")

# The messages below previously contained mis-decoded UTF-8 bytes. Unicode
# escapes are used so the intended symbols survive any re-encoding:
#   \U0001F389 party popper, \u2713 check mark,
#   \U0001F4C1 file folder,  \u20b9 Indian rupee sign
print("\U0001F389 Analysis Complete! Summary:")
print("\u2713 Data loaded and cleaned")
print("\u2713 Trends visualized with charts")
print("\u2713 Features created for prediction")
print("\u2713 Two models trained and compared")
print("\u2713 Best model saved for future use")
print(f"\u2713 Final prediction error: \u20b9{forest_mae:.2f} per share")

print("\n\U0001F4C1 Files created:")
print("- tcs_forest_model.joblib (main prediction model)")
print("- tcs_linear_model.joblib (simple baseline model)")
print("- price_scaler.joblib (data scaler for new predictions)")

In [None]:
# Step 13: How to use your saved model for new predictions
#
# Once you have new stock data, the helper below loads the saved scaler and
# model from disk and returns a prediction for it.
def predict_tcs_price(new_data, model_path="tcs_forest_model.joblib", scaler_path="price_scaler.joblib"):
    """
    Predict the TCS closing price for new data using the saved model.

    Parameters
    ----------
    new_data : DataFrame or 2-D array-like
        Feature rows to predict from. A DataFrame must contain the columns
        the scaler was fitted on (our feature_columns); extra columns are
        ignored and column order does not matter. A plain array must already
        be in the fitted column order.
    model_path : str
        Path to the joblib file holding the trained model.
    scaler_path : str
        Path to the joblib file holding the fitted scaler.

    Returns
    -------
    The prediction for the FIRST row of new_data only.

    Raises
    ------
    ValueError
        If new_data is a DataFrame that lacks a required feature column.
    """
    # Load the model and scaler.
    # NOTE: joblib files are pickle-based; only load files you trust.
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)

    # When the scaler recorded its training column names, select exactly those
    # columns in that order, so extra or reordered columns in new_data are not
    # rejected by the scaler.
    expected_columns = getattr(scaler, "feature_names_in_", None)
    if expected_columns is not None and hasattr(new_data, "columns"):
        missing = [name for name in expected_columns if name not in new_data.columns]
        if missing:
            raise ValueError(f"new_data is missing required feature columns: {missing}")
        new_data = new_data[list(expected_columns)]

    # Scale the new data with the same statistics used during training
    new_data_scaled = scaler.transform(new_data)

    # Make prediction
    prediction = model.predict(new_data_scaled)

    return prediction[0]

# Example of how to use it:
print("To make new predictions, use the predict_tcs_price() function")
print("Make sure your new data has these columns:", feature_columns)