Reference: this notebook is based on the following Kaggle notebook:
https://www.kaggle.com/ahmetarifturkmen/baseline-linear-regression

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.linear_model import Ridge
import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def text_cleaning(text):
    """Normalise a raw comment into lowercase, space-separated alphanumerics.

    The steps run in this order:

    1. Strip URLs (``http://``, ``https://`` and ``www.`` prefixes).
    2. Strip HTML markup by parsing with BeautifulSoup (``lxml`` parser)
       and keeping only the text.
    3. Strip characters in the listed emoji / pictograph code-point ranges.
    4. Replace every character that is not an ASCII letter or a ``\\d``
       digit with a space.
    5. Collapse runs of spaces, trim both ends and lowercase.

    Requires ``re`` and ``BeautifulSoup`` to be in scope.
    NOTE(review): neither is imported in the visible part of this
    notebook - confirm they are imported before this is called.

    text - Text piece to be cleaned.
    """
    # 1. Website links.
    without_links = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2. HTML tags: keep only the text content of the parsed document.
    plain = BeautifulSoup(without_links, 'lxml').get_text()

    # 3. Emoji and related symbol ranges.
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    without_emoji = emoji_pattern.sub(r'', plain)

    # 4. Anything that is not a letter or digit becomes a space.
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)

    # 5. Squeeze repeated spaces, trim the ends, lowercase.
    squeezed = re.sub(' +', ' ', alnum_only)
    return squeezed.strip().lower()

def save_pkl(file_name, processor):
    """Pickle ``processor`` to ``file_name`` under the current directory.

    Args:
        file_name: Output file name; it is joined onto ``'./'``, so an
            absolute path is used unchanged.
        processor: Any picklable object (e.g. fitted models or a vectorizer).

    Prints ``FINISH`` once the file has been written.
    """
    OUTPUT_DIR = './'
    file_name = os.path.join(OUTPUT_DIR, file_name)
    # Use a context manager so the handle is flushed and closed even if
    # pickling raises part-way through.
    with open(file_name, 'wb') as out_file:
        pickle.dump(processor, out_file)
    print("FINISH")
def load_pkl(file_path):
    """Load and return the object pickled at ``file_path``.

    Args:
        file_path: Path to a pickle file.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(file_path, 'rb') as in_file:
        out_object = pickle.load(in_file)
    return out_object

In [None]:
# Competition files: the comments to score and the pairwise validation set.
test_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")
valid_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")

# Every distinct comment that appears on either side of a validation pair.
both_sides = pd.concat([valid_df['less_toxic'], valid_df['more_toxic']])
all_txt = pd.DataFrame({'text': both_sides}).drop_duplicates().reset_index(drop=True)

In [None]:
# External regression training data; only the comment text and its target are kept.
train_ext = pd.read_csv("../input/jigsaw-regression-based-data/train_data_version2.csv")[['text', 'y']]

In [None]:
# Flag every validation comment, left-join the flag onto the training rows,
# and keep only training rows whose text did not match a validation comment
# (unmatched rows carry NaN in dup_flag, which is != 1).
all_txt['dup_flag'] = 1
train_ext = train_ext.merge(all_txt, on='text', how='left')
train_ext = train_ext[train_ext['dup_flag'] != 1].reset_index(drop=True)

In [None]:
# TF-IDF over character n-grams of length 2 to 5 ('char_wb' analyzer),
# fitted on the training text.
tfv = TfidfVectorizer(ngram_range=(2, 5), analyzer='char_wb')
X = tfv.fit_transform(train_ext['text'])

# Regression target, rounded to two decimal places.
w = train_ext["y"].values
y = np.round(w, 2)

In [None]:
def create_multi_models(alpha_list, x, y):
    """Fit one Ridge regressor per regularisation strength.

    Args:
        alpha_list: Iterable of ``alpha`` values, one model per entry.
        x: Training features passed to ``Ridge.fit``.
        y: Training targets passed to ``Ridge.fit``.

    Returns:
        List of fitted ``Ridge`` models, in the same order as ``alpha_list``.
    """
    # ``fit`` returns the estimator itself, so it can be collected directly.
    return [Ridge(alpha=strength).fit(x, y) for strength in alpha_list]
def ensemble(data, preds, cv_num=None):
    """Average several prediction arrays element-wise.

    Args:
        data: Object whose ``shape[0]`` gives the number of samples; only
            used to size the accumulator.
        preds: Iterable of prediction arrays, each broadcastable to
            ``(data.shape[0],)``.
        cv_num: Divisor applied to the summed predictions. When omitted it
            defaults to the number of arrays in ``preds``, so the result is
            their plain mean and cannot drift out of sync with ``preds``.

    Returns:
        1-D float array of length ``data.shape[0]``.

    Raises:
        ValueError: If ``cv_num`` is omitted and ``preds`` is empty.
    """
    if cv_num is None:
        # Materialise so the length is known even for one-shot iterables.
        preds = list(preds)
        if not preds:
            raise ValueError("preds must contain at least one prediction array")
        cv_num = len(preds)

    total = np.zeros((data.shape[0],))
    for single_pred in preds:
        total += single_pred

    return total / cv_num

def multi_predict(test_x, models):
    """Run every model on the same inputs.

    Args:
        test_x: Features handed to each model's ``predict`` method.
        models: Iterable of objects exposing ``predict``.

    Returns:
        List with one prediction result per model, in model order.
    """
    return [fitted.predict(test_x) for fitted in models]

In [None]:
# Ridge regularisation strengths to try; create_multi_models fits one model per value.
alpha_list = [0.5, 1, 2]
ext_models = create_multi_models(alpha_list, X, y)

In [None]:
# Vectorise both sides of each validation pair with the already-fitted TF-IDF vectorizer.
X_less_ext = tfv.transform(valid_df['less_toxic'])
X_more_ext = tfv.transform(valid_df['more_toxic'])

In [None]:
# Persist the fitted models and the vectorizer as pickle files
# (save_pkl joins the file name onto './').
save_pkl('ext_models.pkl', ext_models)
save_pkl('ext_tfv.pkl', tfv)

# validation

In [None]:
# Score each side of the validation pairs with every model;
# each result is a list holding one prediction array per model.
preds_less_ext = multi_predict(X_less_ext, ext_models)
preds_more_ext = multi_predict(X_more_ext, ext_models)

In [None]:
# Average the per-model predictions into one score per comment. The divisor
# comes from the number of fitted models rather than a hard-coded 3, so the
# mean stays correct if alpha_list changes length.
preds_less_ext = ensemble(X_less_ext, preds_less_ext, len(ext_models))
preds_more_ext = ensemble(X_more_ext, preds_more_ext, len(ext_models))

In [None]:
# Store the averaged scores on the validation frame, one column per side of the pair.
valid_df['preds_less_ext_kazuma'] = preds_less_ext
valid_df['preds_more_ext_kazuma'] = preds_more_ext

In [None]:
# Fraction of validation pairs where the less_toxic comment received a strictly
# lower score than the more_toxic one (shown as the cell output).
(valid_df['preds_less_ext_kazuma'] < valid_df['preds_more_ext_kazuma']).mean()


In [None]:
# Write the validation frame to CSV in the working directory, without the index column.
valid_df.to_csv('valid_external.csv', index=False)